0401
0410 044f
0451
-2016
+2015 2016
2020 2021
203b
2074
25b6 25b7
25bc 25bd
25c0 25c1
-25c6 25c7
+25c6 25c8
25cb
25ce 25d1
25e2 25e5
2667 266a
266c 266d
266f
-e000 f8ff
fffd
END
1fdd 1fef
1ff2 1ff4
1ff6 1ffe
-2015
2017
2022 2023
2038
25b8 25bb
25be 25bf
25c2 25c5
-25c8 25ca
+25c9 25ca
25cc 25cd
25d2 25e1
25e6 25ee
fd50 fd8f
fd92 fdc7
fdf0 fdfb
-fe6b
fe70 fe72
fe74
fe76 fefc
0009
007c
00ad
-00b4
058a
0f0b
1361
# This file is built by mktables.PL from e.g. Unicode.txt.
# Any changes made here will be lost!
return <<'END';
+00b4
02c8
02cc
1806
3300 3376
337b 33dd
33e0 33fe
-3400 4db5
-4e00 9fa5
a000 a48c
a490 a4a1
a4a4 a4b3
a4b5 a4c0
a4c2 a4c4
a4c6
-ac00 d7a3
f900 fa2d
fe30 fe34
fe49 fe4f
fe58
fe5f fe66
fe68
+fe6b
ff02 ff03
ff06 ff07
ff0a ff0b
ff5c
ff5e
ffe2 ffe4
-20000 2a6d6
2f800 2fa1d
END
# This file is built by mktables.PL from e.g. Unicode.txt.
# Any changes made here will be lost!
return <<'END';
-d800 db7f
-db80 dbff
-dc00 dfff
END
# - Assigned characters that are not listed explicitly are given the value
# "AL".
# - Unassigned characters are given the value "XX".
-# - Characters within ranges marked by "First>" and "Last>" are omitted,
-# as in UnicodeData.txt. For example, the following means that
-# all characters between 3400 and 4DB5 have the value "ID"
-# 3400;ID;<CJK Ideograph Extension A, First>
-# 4DB5;ID;<CJK Ideograph Extension A, Last>
+# - Character ranges are specified as for other property files in
+# the Unicode Character Database.
#
# The Unicode name of each character is provided in a comment for help
# in identifying the characters.
00B1;PR # PLUS-MINUS SIGN
00B2;AI # SUPERSCRIPT TWO
00B3;AI # SUPERSCRIPT THREE
-00B4;BA # ACUTE ACCENT
+00B4;BB # ACUTE ACCENT
00B5;AL # MICRO SIGN
00B6;AI # PILCROW SIGN
00B7;AI # MIDDLE DOT
2012;BA # FIGURE DASH
2013;BA # EN DASH
2014;B2 # EM DASH
-2015;AL # HORIZONTAL BAR
+2015;AI # HORIZONTAL BAR
2016;AI # DOUBLE VERTICAL LINE
2017;AL # DOUBLE LOW LINE
2018;QU # LEFT SINGLE QUOTATION MARK
25C5;AL # WHITE LEFT-POINTING POINTER
25C6;AI # BLACK DIAMOND
25C7;AI # WHITE DIAMOND
-25C8;AL # WHITE DIAMOND CONTAINING BLACK SMALL DIAMOND
+25C8;AI # WHITE DIAMOND CONTAINING BLACK SMALL DIAMOND
25C9;AL # FISHEYE
25CA;AL # LOZENGE
25CB;AI # WHITE CIRCLE
33FC;ID # IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY TWENTY-NINE
33FD;ID # IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY
33FE;ID # IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY-ONE
-3400;ID # <CJK Ideograph Extension A, First>
-4DB5;ID # <CJK Ideograph Extension A, Last>
-4E00;ID # <CJK Ideograph, First>
-9FA5;ID # <CJK Ideograph, Last>
+3400..4DB5;ID # <CJK Ideograph Extension A, First>..<CJK Ideograph Extension A, Last>
+4E00..9FA5;ID # <CJK Ideograph, First>..<CJK Ideograph, Last>
A000;ID # YI SYLLABLE IT
A001;ID # YI SYLLABLE IX
A002;ID # YI SYLLABLE I
A4C3;ID # YI RADICAL CHE
A4C4;ID # YI RADICAL ZZIET
A4C6;ID # YI RADICAL KE
-AC00;ID # <Hangul Syllable, First>
-D7A3;ID # <Hangul Syllable, Last>
-D800;SG # <Non Private Use High Surrogate, First>
-DB7F;SG # <Non Private Use High Surrogate, Last>
-DB80;SG # <Private Use High Surrogate, First>
-DBFF;SG # <Private Use High Surrogate, Last>
-DC00;SG # <Low Surrogate, First>
-DFFF;SG # <Low Surrogate, Last>
-E000;AI # <Private Use, First>
-F8FF;AI # <Private Use, Last>
+AC00..D7A3;ID # <Hangul Syllable, First>..<Hangul Syllable, Last>
+D800..DB7F;SG # <Non Private Use High Surrogate, First>..<Non Private Use High Surrogate, Last>
+DB80..DBFF;SG # <Private Use High Surrogate, First>..<Private Use High Surrogate, Last>
+DC00..DFFF;SG # <Low Surrogate, First>..<Low Surrogate, Last>
+E000..F8FF;XX # <Private Use, First>..<Private Use, Last>
F900;ID # CJK COMPATIBILITY IDEOGRAPH-F900
F901;ID # CJK COMPATIBILITY IDEOGRAPH-F901
F902;ID # CJK COMPATIBILITY IDEOGRAPH-F902
FE68;ID # SMALL REVERSE SOLIDUS
FE69;PR # SMALL DOLLAR SIGN
FE6A;PO # SMALL PERCENT SIGN
-FE6B;AL # SMALL COMMERCIAL AT
+FE6B;ID # SMALL COMMERCIAL AT
FE70;AL # ARABIC FATHATAN ISOLATED FORM
FE71;AL # ARABIC TATWEEL WITH FATHATAN ABOVE
FE72;AL # ARABIC DAMMATAN ISOLATED FORM
1D7FD;NU # MATHEMATICAL MONOSPACE DIGIT SEVEN
1D7FE;NU # MATHEMATICAL MONOSPACE DIGIT EIGHT
1D7FF;NU # MATHEMATICAL MONOSPACE DIGIT NINE
-20000;ID # <CJK Ideograph Extension B, First>
-2A6D6;ID # <CJK Ideograph Extension B, Last>
+20000..2A6D6;ID # <CJK Ideograph Extension B, First>..<CJK Ideograph Extension B, Last>
2F800;ID # CJK COMPATIBILITY IDEOGRAPH-2F800
2F801;ID # CJK COMPATIBILITY IDEOGRAPH-2F801
2F802;ID # CJK COMPATIBILITY IDEOGRAPH-2F802
E007D;CM # TAG RIGHT CURLY BRACKET
E007E;CM # TAG TILDE
E007F;CM # CANCEL TAG
+F0000..FFFFD;XX # <Plane 15 Private Use, First>..<Plane 15 Private Use, Last>
+100000..10FFFD;XX # <Plane 16 Private Use, First>..<Plane 16 Private Use, Last>
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"
+
+ "http://www.w3.org/TR/REC-html40/loose.dtd">
+
<html>
<head>
-<meta name="GENERATOR" content="Microsoft FrontPage 3.0">
-<title>Unicode 3.0 NamesList File Structure</title>
+<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
+<meta http-equiv="Content-Language" content="en-us">
+<meta name="GENERATOR" content="Microsoft FrontPage 4.0">
+<meta name="ProgId" content="FrontPage.Editor.Document">
+<meta name="keywords"
+content="unicode, normalization, composition, decomposition">
+<meta name="description" content="Describes the format of the Unicode NamesList.txt file">
+<title>UCD: Unicode NamesList File Format</title>
+<link rel="stylesheet" type="text/css" href="http://www.unicode.org/unicode.css">
+<style type="text/css">
+
+<!--
+
+.foo { }
+-->
+
+</style>
</head>
-<body>
-
-<h3>Unicode NamesList File Format</h3>
-
-<p>Last updated: 1999-07-06</p>
-
-<h3>1.0 Introduction</h3>
+<body bgcolor="#ffffff">
+
+<table width="100%" cellpadding="0" cellspacing="0" border="0">
+ <tr>
+ <td>
+ <table width="100%" border="0" cellpadding="0" cellspacing="0">
+ <tr>
+ <td class="icon"><a href="http://www.unicode.org"><img border="0"
+ src="http://www.unicode.org/webscripts/logo60s2.gif" align="middle"
+ alt="[Unicode]" width="34" height="33"></a> <a
+ class="bar" href="UnicodeCharacterDatabase-3.1.0.html">Unicode Character
+ Database</a></td>
+ </tr>
+ </table>
+ </td>
+ </tr>
+ <tr>
+ <td class="gray"> </td>
+ </tr>
+</table>
+ <h1>Unicode NamesList File Format</h1>
+<table height="87" cellSpacing="2" cellPadding="0" width="100%" border="1">
+ <tbody>
+ <tr>
+ <td vAlign="top" width="144">Revision</td>
+ <td vAlign="top">3.1</td>
+ </tr>
+ <tr>
+ <td vAlign="top" width="144">Authors</td>
+ <td vAlign="top">Asmus Freytag</td>
+ </tr>
+ <tr>
+ <td vAlign="top" width="144">Date</td>
+ <td vAlign="top">2001-02-26</td>
+ </tr>
+ <tr>
+ <td vAlign="top" width="144">This Version</td>
+ <td vAlign="top"><a href="http://www.unicode.org/Public/3.1-Update/NamesList-2.html">http://www.unicode.org/Public/3.1-Update/NamesList-2.html</a></td>
+ </tr>
+ <tr>
+ <td vAlign="top" width="144">Previous Version</td>
+ <td vAlign="top"><a href="http://www.unicode.org/Public/3.0-Update/NamesList-1.html">http://www.unicode.org/Public/3.0-Update/NamesList-1.html</a></td>
+ </tr>
+ <tr>
+ <td vAlign="top" width="144">Latest Version</td>
+ <td vAlign="top"><a href="http://www.unicode.org/Public/UNIDATA/NamesList.html">http://www.unicode.org/Public/UNIDATA/NamesList.html</a></td>
+ </tr>
+ </tbody>
+</table>
+<h3>
+<br>
+<i>Summary</i></h3>
+<blockquote>
+ <p>This file describes the format and contents of NamesList.txt</p>
+</blockquote>
+<h3><i>Status</i></h3>
+<blockquote>
+<p>
+<i>The file and the files described herein are part of the <a href="UnicodeCharacterDatabase-3.1.0.html"> Unicode Character Database</a>
+(UCD)
+and are governed by the <a href="#Terms of Use">UCD Terms of Use</a> stated at the end.</i></p>
+</blockquote>
+ <hr width="50%">
+
+<h2>1.0 Introduction</h2>
<p>The Unicode name list file NamesList.txt (also NamesList.lst) is a plain text file used
to drive the layout of the character code charts in the Unicode Standard. The information
| CHAR_ENTRY NOTICE
</strong></pre>
-<p>In other words:<br>
+<p>In other words:<br>
<br>
-Neither TITLE nor SUBTITLE may occur after the first BLOCKHEADER. </p>
+Neither TITLE nor SUBTITLE may occur after the first BLOCKHEADER. </p>
-<p>Only TITLE, SUBTITLE, SUBHEADER, PAGEBREAK, COMMENT_LINE, and IGNORED_LINE may
-occur before the first BLOCKHEADER.</p>
+<p>Only TITLE, SUBTITLE, SUBHEADER, PAGEBREAK, COMMENT_LINE, and IGNORED_LINE may
+occur before the first BLOCKHEADER.</p>
<p>Directly following either a NAME_LINE or a RESERVED_LINE an uninterrupted sequence of
the following lines may occur (in any order and repeated as often as needed): ALIAS_LINE,
// blank page, then output one or more charts
// followed by the list of character names.
// use BLOCKSTART and BLOCKEND to define the
- // what characters belong to a block
+ // characters belonging to a block
// use blockname in page and table headers
<strong> "@@" <tab> BLOCKSTART <tab> BLOCKNAME COMMENT <tab> BLOCKEND
</strong>// if a comment is present it replaces the blockname
// character corresponding to char
// If character is combining, it is replaced with
// CHAR NBSP <circ> x NBSP where <circ> is the
- // dotted circle</small>
-</pre>
+ // dotted circle</small></pre>
+
+<p><strong>Notes:</strong>
+
+</p>
+
+<ul>
+ <li>Blocks must be aligned on a 16-code point boundary and contain an integer
+ multiple of 16 code points. The exception to that rule is for blocks of
+ ideographs etc. for which no names are listed in the file. Such blocks must
+ end on the actual last character.</li>
+ <li>Blocks must be non-overlapping and in ascending order. Namelines
+ must be in ascending order and following the block header for the block to
+ which they belong.</li>
+ <li>Reserved entries are optional, and will be supplied automatically. They
+ are required whenever followed by ALIAS_LINE, COMMENT_LINE or CROSS_REF.</li>
+</ul>
<h3><strong>1.4 NamesList File Primitives</strong></h3>
<p>The following are the primitives and terminals for the NamesList syntax.</p>
-<pre><small><strong>LINE: STRING LF
-COMMENT: "(" NAME ")"
- "(" NAME ")" "*"
-</strong>
-<strong>NAME</strong>: <sequence of ASCII characters, except "(" or ")" >
+<pre><strong><small>LINE: STRING LF
+COMMENT: "(" NAME ")"
+ "(" NAME ")" "*" </small></strong><small>
+<strong>BLOCKNAME:</strong> <sequence of Latin-1 characters, except "(" and ")">
+<strong>NAME</strong>: <sequence of uppercase ASCII letters, digit and hyphen>
<strong>STRING</strong>: <sequence of Latin-1 characters>
<strong>CHAR</strong>: <strong>X X X X</strong>
- <strong>| X X X X X X X X X</strong></small>
+ <strong>| X X X X X</strong>
+ <strong>| X X X X X X</strong></small>
<small><strong>X: "0"|"1"|"2"|"3"|"4"|"5"|"6"|"7"|"8"|"9"|"A"|"B"|"C"|"D"|"E"|"F"
<tab>:</strong> <sequence of one or more ASCII tab characters 0x09>
<strong>SP</strong>: <ASCII 0x20>
<ul>
<li>Special lookahead logic prevents a mention of a 4 digit standard, such as ISO 9999 from
- being misinterpreted as ISO CHAR.</li>
+ being misinterpreted as ISO CHAR. The - in a character range CHAR-CHAR is
+ replaced by an EN DASH.</li>
<li>Use of Latin-1 is supported in unibook.exe, but not portably, unless the file is encoded as
UTF-16LE.</li>
<li>The final LF in the file must be present</li>
- <li>A CHAR inside ' or " is expanded, but only its glyph image is printed, the
- code value is not echoed</li>
- <li>Straight quotes in an EXPAND_LINE are replaced by curly quotes using English rules.
- Apostrophes are supported, but nested quotes are not.</li>
+ <li>A CHAR inside ' or " is expanded, but only its glyph image is printed,
+ the
+ code value is not echoed.</li>
+ <li>Straight quotes in an EXPAND_LINE are replaced by curly quotes using English rules.
+ Apostrophes are supported, but nested quotes are not.</li>
</ul>
-</body>
-</html>
+<h2>Modifications</h2>
+<p>Use of 4-6 digit hex notation is now supported.</p>
+ <hr width="50%">
+<h2>
+UCD <a name="Terms of Use">Terms of Use</a></h2>
+<h3>
+<i>Disclaimer</i></h3>
+<blockquote>
+ <p><i>The Unicode Character Database is provided as is by Unicode, Inc. No
+ claims are made as to fitness for any particular purpose. No warranties of any
+ kind are expressed or implied. The recipient agrees to determine applicability
+ of information provided. If this file has been purchased on magnetic or
+ optical media from Unicode, Inc., the sole remedy for any claim will be
+ exchange of defective media within 90 days of receipt.</i></p>
+ <p><i>This disclaimer is applicable for all other data files accompanying the
+ Unicode Character Database, some of which have been compiled by the Unicode
+ Consortium, and some of which have been supplied by other sources.</i></p>
+</blockquote>
+<h3><i>Limitations on Rights to Redistribute This Data</i></h3>
+<blockquote>
+ <p><i>Recipient is granted the right to make copies in any form for internal
+ distribution and to freely use the information supplied in the creation of
+ products supporting the Unicode<sup>TM</sup> Standard. The files in the
+ Unicode Character Database can be redistributed to third parties or other
+ organizations (whether for profit or not) as long as this notice and the
+ disclaimer notice are retained. Information can be extracted from these files
+ and used in documentation or programs, as long as there is an accompanying
+ notice indicating the source.</i></p>
+</blockquote>
+ <hr width="50%">
+ <div align="center">
+ <center>
+ <table cellspacing="0" cellpadding="0" border="0">
+ <tr>
+ <td><a href="../../../../../../index.html"><img
+ src="http://www.unicode.org/img/hb_home.gif" border="0"
+ alt="Home" width="40" height="49"></a><a
+ href="../copyright.html"><img
+ src="http://www.unicode.org/img/hb_mid.gif" border="0"
+ alt="Terms of Use" width="152" height="49"></a><a
+ href="mailto:info@unicode.org"><img
+ src="http://www.unicode.org/img/hb_mail.gif" border="0"
+ alt="E-mail" width="46" height="49"></a></td>
+ </tr>
+ </table>
+ <script language="Javascript" src="http://www.unicode.org/webscripts/lastModified.js"></script>
+ </center>
+ </div>
+</form>
+
+</body>
+
+</html>
--- /dev/null
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
+<html>
+
+<head>
+<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
+<meta http-equiv="Content-Language" content="en-us">
+<meta name="GENERATOR" content="Microsoft FrontPage 4.0">
+<meta name="ProgId" content="FrontPage.Editor.Document">
+<meta name="keywords"
+content="unicode, normalization, composition, decomposition">
+<meta name="description" content="Describes PropList.txt">
+<title>UCD: Extended Character Properties</title>
+<link rel="stylesheet" type="text/css" href="http://www.unicode.org/unicode.css">
+</head>
+
+<body bgcolor="#ffffff">
+
+<table width="100%" cellpadding="0" cellspacing="0" border="0">
+ <tr>
+ <td>
+ <table width="100%" border="0" cellpadding="0" cellspacing="0">
+ <tr>
+ <td class="icon"><a href="http://www.unicode.org"><img border="0"
+ src="http://www.unicode.org/webscripts/logo60s2.gif" align="middle"
+ alt="[Unicode]" width="34" height="33"></a> <a
+ class="bar" href="UnicodeCharacterDatabase.html">Unicode Character
+ Database</a></td>
+ </tr>
+ </table>
+ </td>
+ </tr>
+ <tr>
+ <td class="gray"> </td>
+ </tr>
+</table>
+<h1>Extended Character Properties</h1>
+<table height="87" cellspacing="2" cellpadding="0" width="100%" border="1">
+ <tbody>
+ <tr>
+ <td valign="top" width="144">Revision</td>
+ <td valign="top">3.1.0</td>
+ </tr>
+ <tr>
+ <td valign="top" width="144">Authors</td>
+ <td valign="top">Mark Davis</td>
+ </tr>
+ <tr>
+ <td valign="top" width="144">Date</td>
+ <td valign="top">2001-02-28</td>
+ </tr>
+ <tr>
+ <td valign="top" width="144">This Version</td>
+ <td valign="top"><a
+ href="http://www.unicode.org/Public/3.1-Update/PropList-3.1.0.html">http://www.unicode.org/Public/3.1-Update/PropList-3.1.0.html</a></td>
+ </tr>
+ <tr>
+ <td valign="top" width="144">Previous Version</td>
+ <td valign="top">n/a</td>
+ </tr>
+ <tr>
+ <td valign="top" width="144">Latest Version</td>
+ <td valign="top"><a
+ href="http://www.unicode.org/Public/UNIDATA/PropList.html">http://www.unicode.org/Public/UNIDATA/PropList.html</a></td>
+ </tr>
+ </tbody>
+</table>
+<h3><i><br>
+Summary</i></h3>
+<blockquote>
+ <p><i>This document describes the format and content of the PropList.txt data
+ file in the Unicode Character Database (UCD).</i></p>
+</blockquote>
+<h3><i>Status</i></h3>
+<blockquote>
+ <p><i>The file and the files described herein are part of the Unicode
+ Character Database and are governed by the <a href="#UCD_Terms">UCD Terms of Use</a>
+ given below.</i></p>
+ <p><i>For general information on file formats and table formats, and the
+ implications of normative vs informative properties, see
+ UnicodeCharacterDatabase.html.</i></p>
+ <p><i><b>Warning: </b>the information in this file does not completely
+ describe the use and interpretation of Unicode character properties and
+ behavior. It must be used in conjunction with the data in the other files in
+ the UCD, and relies on the notation and definitions supplied in <a
+ href="http://www.unicode.org/unicode/standard/versions/Unicode3.0.html">The
+ Unicode Standard</a>. All chapter references are to Version 3.1.0 of the
+ standard.</i></p>
+</blockquote>
+<hr width="50%">
+<h2>Introduction</h2>
+<p align="left">PropList.txt contains extended properties that supplement the
+General Category property described in UnicodeData.html. Unlike the derived
+properties, the properties in PropList.txt cannot be derived directly from
+UnicodeData.txt or other data files of the UCD. These properties are listed in
+the following table.</p>
+<div align="center">
+ <center>
+ <table border="1" cellspacing="0" cellpadding="3" class="smallText">
+ <tr>
+ <th>Property Value</th>
+ <th>N/I</th>
+ <th>Definition and Usage</th>
+ </tr>
+ <tr>
+ <th valign="top">White_space</th>
+ <th valign="top">N</th>
+ <td valign="top">Space characters and those format control characters
+ (such as TAB, CR and LF) which should be treated by programming
+ languages as "white space" for the purpose of parsing
+ elements.
+ <p><b>Note:</b> ZERO WIDTH SPACE and ZERO WIDTH NO-BREAK SPACE are not
+ included, since their functions are restricted to line-break control.
+ Their names are unfortunately misleading in this respect.</p>
+ <p><b>Note: </b>There are other senses of "whitespace" that
+ encompass a different set of characters.</p>
+ </td>
+ </tr>
+ <tr>
+ <th valign="top">Bidi_Control</th>
+ <th valign="top">N</th>
+ <td valign="top">Those format control characters which have specific
+ functions in the Bidirectional Algorithm.</td>
+ </tr>
+ <tr>
+ <th valign="top">Join_Control</th>
+ <th valign="top">N</th>
+ <td valign="top">Those format control characters which have specific
+ functions for control of cursive joining and ligation.</td>
+ </tr>
+ <tr>
+ <th valign="top">Dash</th>
+ <th valign="top">I</th>
+ <td valign="top">Those punctuation characters explicitly called out as
+ dashes in the Unicode Standard, plus compatibility equivalents to those.
+ Most of these have the Pd General Category, but some have the Sm General
+ Category because of their use in mathematics.</td>
+ </tr>
+ <tr>
+ <th valign="top">Hyphen</th>
+ <th valign="top">I</th>
+ <td valign="top">Those dashes used to mark connections between pieces of
+ words, plus the Katakana middle dot. The Katakana middle dot functions
+ like a hyphen, but is shaped like a dot rather than a dash.</td>
+ </tr>
+ <tr>
+ <th valign="top">Quotation_Mark</th>
+ <th valign="top">I</th>
+ <td valign="top">Those punctuation characters that function as quotation
+ marks.</td>
+ </tr>
+ <tr>
+ <th valign="top">Terminal_Punctuation</th>
+ <th valign="top">I</th>
+ <td valign="top">Those punctuation characters that generally mark the end
+ of textual units.</td>
+ </tr>
+ <tr>
+ <th valign="top">Other_Math</th>
+ <th valign="top">I</th>
+ <td valign="top">Math characters that do not have the Sm General Category.</td>
+ </tr>
+ <tr>
+ <th valign="top">Hex_Digit</th>
+ <th valign="top">I</th>
+ <td valign="top">Characters commonly used for the representation of
+ hexadecimal numbers, plus their compatibility equivalents.</td>
+ </tr>
+ <tr>
+ <th valign="top">Other_Alphabetic</th>
+ <th valign="top">I</th>
+ <td valign="top">Alphabetic characters that do not have L as their major
+ class for the General Category (Lu, Ll, Lt, Lm, Lo).</td>
+ </tr>
+ <tr>
+ <th valign="top">Ideographic</th>
+ <th valign="top">I</th>
+ <td valign="top">Characters considered to be CJKV (Chinese, Japanese,
+ Korean, and Vietnamese) ideographs.</td>
+ </tr>
+ <tr>
+ <th valign="top">Diacritic</th>
+ <th valign="top">I</th>
+ <td valign="top">Characters that linguistically modify the meaning of
+ another character to which they apply. Some diacritics are not combining
+ characters, and some combining characters are not diacritics.</td>
+ </tr>
+ <tr>
+ <th valign="top">Extender</th>
+ <th valign="top">I</th>
+ <td valign="top">Characters whose principal function is to extend the
+ value or shape of a preceding alphabetic character. Typical of these are
+ length and iteration marks.</td>
+ </tr>
+ <tr>
+ <th valign="top">Other_Lowercase</th>
+ <th valign="top">I</th>
+ <td valign="top">Lowercase characters that do not have the Ll General
+ Category.</td>
+ </tr>
+ <tr>
+ <th valign="top">Other_Uppercase</th>
+ <th valign="top">I</th>
+ <td valign="top">Uppercase characters that do not have the Lu General
+ Category.</td>
+ </tr>
+ <tr>
+ <th valign="top">Noncharacter_Code_Point</th>
+ <th valign="top">N</th>
+ <td valign="top">Code points that are explicitly defined as illegal for
+ the encoding of characters. See <a
+ href="http://www.unicode.org/unicode/reports/tr27/">Unicode 3.1</a> for
+ more information.</td>
+ </tr>
+ </table>
+ </center>
+</div>
+<h2><i><a name="UCD_Terms"><br>
+UCD Terms of Use</a></i></h2>
+<h3><i>Disclaimer</i></h3>
+<blockquote>
+ <p><i>The Unicode Character Database is provided as is by Unicode, Inc. No
+ claims are made as to fitness for any particular purpose. No warranties of any
+ kind are expressed or implied. The recipient agrees to determine applicability
+ of information provided. If this file has been purchased on magnetic or
+ optical media from Unicode, Inc., the sole remedy for any claim will be
+ exchange of defective media within 90 days of receipt.</i></p>
+ <p><i>This disclaimer is applicable for all other data files accompanying the
+ Unicode Character Database, some of which have been compiled by the Unicode
+ Consortium, and some of which have been supplied by other sources.</i></p>
+</blockquote>
+<h3><i>Limitations on Rights to Redistribute This Data</i></h3>
+<blockquote>
+ <p><i>Recipient is granted the right to make copies in any form for internal
+ distribution and to freely use the information supplied in the creation of
+ products supporting the Unicode<sup>TM</sup> Standard. The files in the
+ Unicode Character Database can be redistributed to third parties or other
+ organizations (whether for profit or not) as long as this notice and the
+ disclaimer notice are retained. Information can be extracted from these files
+ and used in documentation or programs, as long as there is an accompanying
+ notice indicating the source.</i></p>
+</blockquote>
+<hr width="50%">
+<p align="center"><a href="http://www.unicode.org/unicode/copyright.html"><img
+src="http://www.unicode.org/img/hb_home.gif" border="0" alt="Home" width="40"
+height="49"><img src="http://www.unicode.org/img/hb_mid.gif" border="0"
+alt="Terms of Use" width="152" height="49"><img
+src="http://www.unicode.org/img/hb_mail.gif" border="0" alt="E-mail" width="46"
+height="49"></a>
+
+</body>
+
+</html>
<body>
-<h1>UNICODE CHARACTER DATABASE<br>
-Version 3.0.1</h1>
+<table width="100%" cellpadding="0" cellspacing="0" border="0">
+ <tr>
+ <td>
+ <table width="100%" border="0" cellpadding="0" cellspacing="0">
+ <tr>
+ <td class="icon"><a href="http://www.unicode.org"><img border="0"
+ src="http://www.unicode.org/webscripts/logo60s2.gif" align="middle"
+ alt="[Unicode]" width="34" height="33"></a> <a
+ class="bar" href="UnicodeCharacterDatabase.html">Unicode Character
+ Database</a></td>
+ </tr>
+ </table>
+ </td>
+ </tr>
+ <tr>
+ <td class="gray"> </td>
+ </tr>
+</table>
+<h1>UNICODE CHARACTER DATABASE</h1>
<table border="1" cellspacing="2" cellpadding="0" height="87" width="100%">
<tr>
<td valign="TOP" width="144">Revision</td>
- <td valign="TOP">3.0.1</td>
+ <td valign="TOP">3.1.0</td>
</tr>
<tr>
<td valign="TOP" width="144">Authors</td>
</tr>
<tr>
<td valign="TOP" width="144">Date</td>
- <td valign="TOP">2000-08-17</td>
+ <td valign="TOP">2001-02-28</td>
</tr>
<tr>
<td valign="TOP" width="144">This Version</td>
<td valign="TOP"><a
- href="http://www.unicode.org/Public/3.0-Update1/UnicodeCharacterDatabase-3.0.1.html">http://www.unicode.org/Public/3.0-Update1/UnicodeCharacterDatabase-3.0.1.html</a></td>
+ href="http://www.unicode.org/Public/3.1-Update/UnicodeCharacterDatabase-3.1.0.html">http://www.unicode.org/Public/3.1-Update/UnicodeCharacterDatabase-3.1.0.html</a></td>
</tr>
<tr>
<td valign="TOP" width="144">Previous Version</td>
<td valign="TOP"><a
- href="http://www.unicode.org/Public/3.0-Update/UnicodeCharacterDatabase-3.0.0.html">http://www.unicode.org/Public/3.0-Update/UnicodeCharacterDatabase-3.0.0.html</a></td>
+ href="http://www.unicode.org/Public/3.0-Update1/UnicodeCharacterDatabase-3.0.1.html">http://www.unicode.org/Public/3.0-Update1/UnicodeCharacterDatabase-3.0.1.html</a></td>
</tr>
<tr>
<td valign="TOP" width="144">Latest Version</td>
href="http://www.unicode.org/Public/UNIDATA/UnicodeCharacterDatabase.html">http://www.unicode.org/Public/UNIDATA/UnicodeCharacterDatabase.html</a></td>
</tr>
</table>
-<p align="center">Copyright © 1995-2000 Unicode, Inc. All Rights reserved.</p>
-<h2>Disclaimer</h2>
-<p>The Unicode Character Database is provided as is by Unicode, Inc. No claims
-are made as to fitness for any particular purpose. No warranties of any kind are
-expressed or implied. The recipient agrees to determine applicability of
-information provided. If this file has been purchased on magnetic or optical
-media from Unicode, Inc., the sole remedy for any claim will be exchange of
-defective media within 90 days of receipt.</p>
-<p>This disclaimer is applicable for all other data files accompanying the
-Unicode Character Database, some of which have been compiled by the Unicode
-Consortium, and some of which have been supplied by other sources.</p>
-<h2>Limitations on Rights to Redistribute This Data</h2>
-<p>Recipient is granted the right to make copies in any form for internal
-distribution and to freely use the information supplied in the creation of
-products supporting the Unicode<sup>TM</sup> Standard. The files in the Unicode
-Character Database can be redistributed to third parties or other organizations
-(whether for profit or not) as long as this notice and the disclaimer notice are
-retained. Information can be extracted from these files and used in
-documentation or programs, as long as there is an accompanying notice indicating
-the source.</p>
+<h3><br>
+<i>Summary</i></h3>
+<blockquote>
+ <p><i>This document describes the format and content of the Unicode Character
+ Database (UCD)</i></p>
+</blockquote>
+<h3><i>Status</i></h3>
+<blockquote>
+ <p><i>The file and the files described herein are part of the Unicode
+ Character Database and are governed by the <a href="#UCD_Terms">UCD Terms of
+ Use</a> given below.</i></p>
+ <p><i>The <a href="#References">References</a> provide related information
+ that is useful in understanding this document.</i></p>
+ <p><i><b>Warning: </b>the information in this file does not completely
+ describe the use and interpretation of Unicode character properties and
+ behavior. It must be used in conjunction with the data in the other files in
+ the Unicode Character Database, and relies on the notation and definitions
+ supplied in <a
+ href="http://www.unicode.org/unicode/standard/versions/Unicode3.0.html">The
+ Unicode Standard</a>. All chapter references are to Version 3.1.0 of the
+ standard.</i></p>
+</blockquote>
<h2>Introduction</h2>
-<p>The Unicode Character Database is a set of files that define the Unicode
-character properties and internal mappings. For more information about character
-properties and mappings, see <i><a
-href="http://www.unicode.org/unicode/uni2book/u2.html">The Unicode Standard</a></i>.</p>
-<p>The Unicode Character Database has been updated to reflect Version 3.0 of the
-Unicode Standard, with many characters added to those published in Version 2.0.
-A number of corrections have also been made to case mappings or other errors in
-the database noted since the publication of Version 2.0. Normative bidirectional
-properties have also been modified to reflect decisions of the Unicode Technical
-Committee.</p>
-<p>For more information on versions of the Unicode Standard and how to reference
-them, see <a href="http://www.unicode.org/unicode/standard/versions/">http://www.unicode.org/unicode/standard/versions/</a>.</p>
-<h2>Conformance</h2>
-<p>Character properties may be either normative or informative. <i>Normative</i>
-means that implementations that claim conformance to the Unicode Standard (at a
-particular version) and which make use of a particular property or field must
-follow the specifications of the standard for that property or field in order to
-be conformant. The term <i>normative</i> when applied to a property or field of
-the Unicode Character Database, does <i>not</i> mean that the value of that
-field will never change. Corrections and extensions to the standard in the
-future may require minor changes to normative values, even though the Unicode
-Technical Committee strives to minimize such changes. An<i> informative </i>property
-or field is strongly recommended, but a conformant implementation is free to use
-or change such values as it may require while still being conformant to the
-standard. Particular implementations may choose to override the properties and
-mappings that are not normative. In that case, it is up to the implementer to
-establish a protocol to convey that information.</p>
-<h2>Files</h2>
-<p>The following summarizes the files in the Unicode Character Database. For
-more information about these files, see the referenced technical report(s) or
-section of Unicode Standard, Version 3.0.</p>
-<p><b>UnicodeData.txt (Chapter 4, <a
-href="http://www.unicode.org/unicode/reports/tr21/">UTR #21: Case Mappings</a>, <a
-href="http://www.unicode.org/unicode/reports/tr15/">UAX #15 Unicode Normalization
-Forms</a>)</b>
-<ul>
- <li>The main file in the Unicode Character Database.</li>
- <li>For detailed information on the format, see <a href="UnicodeData.html">UnicodeData.html</a>.
- This file also characterizes which properties are normative and which are
- informative.</li>
-</ul>
-<p><b>PropList.txt (Chapter 4)</b>
-<ul>
- <li>Additional informative properties list: <i>Alphabetic, Ideographic,</i>
- and <i>Mathematical</i>, among others.</li>
-</ul>
-<p><b>SpecialCasing.txt (Chapter 4, <a
-href="http://www.unicode.org/unicode/reports/tr21/">UTR #21: Case Mappings</a>)</b>
-<ul>
- <li>List of informative special casing properties, including one-to-many
- mappings such as SHARP S => "SS", and locale-specific mappings,
- such as for Turkish <i>dotless i</i>.</li>
-</ul>
-<p><b>Blocks.txt (Chapter 14)</b>
-<ul>
- <li>List of normative block names.</li>
-</ul>
-<p><b>Jamo.txt (Chapter 4)</b>
-<ul>
- <li>List of normative Jamo short names, used in deriving HANGUL SYLLABLE names
- algorithmically.</li>
-</ul>
-<p><b>ArabicShaping.txt (Section 8.2)</b>
-<ul>
- <li>Basic Arabic and Syriac character shaping properties, such as initial,
- medial and final shapes. These properties are normative for minimal shaping
- of Arabic and Syriac.</li>
-</ul>
-<p><b>NamesList.txt (Chapter 14)</b>
-<ul>
- <li>This file duplicates some of the material in the UnicodeData file, and
- adds informative annotations uses in the character charts, as printed in the
- Unicode Standard.</li>
- <li><b>Note: </b>The information in NamesList.txt and Index.txt files matches
- the appropriate version of the book. Changes in the Unicode Character
- Database since then may not be reflected in these files, since they are
- primarily of archival interest.</li>
-</ul>
-<p><b>Index.txt (Chapter 14)</b>
-<ul>
- <li>Informative index to Unicode characters, as printed in the Unicode
- Standard</li>
- <li><b>Note: </b>The information in NamesList.txt and Index.txt files matches
- the appropriate version of the book. Changes in the Unicode Character
- Database since then may not be reflected in these files, since they are
- primarily of archival interest.</li>
-</ul>
-<p><b>CompositionExclusions.txt (<a
-href="http://www.unicode.org/unicode/reports/tr15/">UAX #15 Unicode Normalization
-Forms</a>)</b>
-<ul>
- <li>Normative properties for normalization.</li>
-</ul>
-<p><b>LineBreak.txt (<a href="http://www.unicode.org/unicode/reports/tr14/">UAX
-#14: Line Breaking Properties</a>)</b>
+<p>The Unicode Character Database (UCD) is a set of files that define the
+Unicode character properties and internal mappings. This document describes the
+files that are part of <a href="http://www.unicode.org/unicode/reports/tr27/">The
+Unicode Standard, Version 3.1</a> [<a href="#U3.1">U3.1</a>]. The main changes
+in this version are:</p>
<ul>
- <li>Normative and informative properties for line breaking. To see which
- properties are informative and which are normative, consult UAX #14.</li>
+ <li>All of the data files have been updated to account for the large number of
+ additional characters in Unicode 3.1.</li>
+ <li>PropList.txt has been extensively reorganized and reformatted.</li>
+ <li>Scripts.txt has been added to the UCD.</li>
+ <li>A large number of informative derived property files have been added to
+ the UCD.</li>
</ul>
-<p><b>EastAsianWidth.txt (<a href="http://www.unicode.org/unicode/reports/tr11/">UAX
-#11: East Asian Character Width</a>)</b>
+<p><i>Files in the UCD use a common format unless otherwise specified. For
+details, see <a href="#UCD_File_Format">UCD File Format</a>.</i></p>
+<h2><a name="Conformance">Conformance</a></h2>
+<p>For information on the meaning and application of the terms normative and
+informative, see "Chapter 4, Character Properties (revision)" in <a
+href="http://www.unicode.org/unicode/reports/tr27/#conformance">UAX #27, Unicode
+3.1</a>.</p>
+<p>Some informative data files contain derived properties, properties that can
+be derived from other properties in the UCD. The derived properties that are
+computed from solely normative properties are themselves normative, while the
+others are informative.</p>
+<h2>UCD Files</h2>
+<p>The following table summarizes the files in the Unicode Character Database.
+ For more information about these files, see the referenced technical
+report(s), files, or section of the Unicode Standard, Version 3.1.</p>
+<table border="1" cellspacing="0" cellpadding="4">
+ <tr>
+ <th>".txt" File</th>
+ <th>Description</th>
+ <th align="center">N/I</th>
+ <th>Summary</th>
+ </tr>
+ <tr>
+ <td>ArabicShaping</td>
+ <td>Section 8.2</td>
+ <td align="center">N</td>
+ <td>Basic Arabic and Syriac character shaping properties, such as initial,
+ medial and final shapes.</td>
+ </tr>
+ <tr>
+ <td>BidiMirroring</td>
+ <td><a href="http://www.unicode.org/unicode/reports/tr9/">UAX #9</a></td>
+ <td align="center">I</td>
+ <td>Properties for substituting characters in an implementation of
+ bidirectional mirroring.</td>
+ </tr>
+ <tr>
+ <td>Blocks</td>
+ <td>Chapter 14</td>
+ <td align="center">N</td>
+ <td>List of block names.</td>
+ </tr>
+ <tr>
+ <td>CaseFolding</td>
+ <td><a href="http://www.unicode.org/unicode/reports/tr21/">UTR #21</a></td>
+ <td align="center">N</td>
+ <td>Mapping from characters to their case-folded forms. This is an
+ informative file containing normative derived properties.
+ <p><i>Derived from UnicodeData and SpecialCasing.</i></p>
+ </td>
+ </tr>
+ <tr>
+ <td>CompositionExclusions</td>
+ <td><a href="http://www.unicode.org/unicode/reports/tr15/">UAX #15</a></td>
+ <td align="center">N</td>
+ <td>Properties for normalization.</td>
+ </tr>
+ <tr>
+ <td><i>DerivedXXX</i></td>
+ <td>DerivedProperties.html</td>
+ <td align="center">N/I</td>
+ <td>Various informative derived files, described in the documentation file.
+ Some of the derived properties are normative and some are informative.</td>
+ </tr>
+ <tr>
+ <td>EastAsianWidth</td>
+ <td><a href="http://www.unicode.org/unicode/reports/tr11/">UAX #11</a></td>
+ <td align="center">I</td>
+ <td>Properties for determining the choice of wide vs. narrow glyphs in East
+ Asian contexts.</td>
+ </tr>
+ <tr>
+ <td>Index</td>
+ <td>Chapter 14</td>
+ <td align="center">I</td>
+ <td>Index to Unicode characters, as printed in the Unicode Standard. (See <a
+ href="#Update_Note">Update Note</a>.)</td>
+ </tr>
+ <tr>
+ <td>Jamo</td>
+ <td>Chapter 4</td>
+ <td align="center">N</td>
+ <td>List of Jamo short names, used in deriving HANGUL SYLLABLE names
+ algorithmically.</td>
+ </tr>
+ <tr>
+ <td>LineBreak</td>
+ <td><a href="http://www.unicode.org/unicode/reports/tr14/">UAX #14</a></td>
+ <td align="center">N/I</td>
+ <td>Properties for line breaking.</td>
+ </tr>
+ <tr>
+ <td>NamesList</td>
+ <td>Chapter 14</td>
+ <td align="center">I</td>
+ <td>This file duplicates some of the material in the UnicodeData file, and
+ adds annotations used in the character charts.</td>
+ </tr>
+ <tr>
+ <td>NormalizationTest</td>
+ <td><a href="http://www.unicode.org/unicode/reports/tr15/">UAX #15</a></td>
+ <td align="center">N</td>
+ <td>Test file for conformance to Unicode Normalization Forms.</td>
+ </tr>
+ <tr>
+ <td>PropList</td>
+ <td>PropList.html</td>
+ <td align="center">N/I</td>
+ <td>Extended character properties</td>
+ </tr>
+ <tr>
+ <td>Scripts</td>
+ <td><a href="http://www.unicode.org/unicode/reports/tr24/">UTR #24</a></td>
+ <td align="center">I</td>
+ <td>Default script values for use in regular expressions.</td>
+ </tr>
+ <tr>
+ <td>SpecialCasing</td>
+ <td>Chapter 4,<br>
+ <a href="http://www.unicode.org/unicode/reports/tr21/">UTR #21</a></td>
+ <td align="center">N</td>
+ <td>List of properties required for full case mapping.</td>
+ </tr>
+ <tr>
+ <td>UnicodeData</td>
+ <td>UnicodeData.html,<br>
+ Chapter 4,<br>
+ <a href="http://www.unicode.org/unicode/reports/tr21/">UTR #21</a>,<br>
+ <a href="http://www.unicode.org/unicode/reports/tr15/">UAX #15</a></td>
+ <td align="center">N/I</td>
+ <td>The main file in the UCD. </td>
+ </tr>
+ <tr>
+ <td>Unihan</td>
+ <td>Unihan.txt</td>
+ <td align="center">N/I</td>
+ <td>Extended properties of Han (CJK) characters. (See <a href="#Format_Note">Format
+ Note</a>.)</td>
+ </tr>
+</table>
+<blockquote>
+ <p><b><a name="Update_Note">Update Note</a>: </b>The information in the
+ Index.txt file matches the appropriate version of the book. Changes in the
+ Unicode Character Database since then may not be reflected in this file, since
+ it is primarily of archival interest.</p>
+ <p><b><a name="Format_Note">Format Note</a>: </b>The file data format differs
+ from the standard format, and is described in the header of the file. The
+ header also describes which properties are informative and which are
+ normative.</p>
+</blockquote>
+<h2><a name="UCD_File_Format">UCD File Format</a></h2>
+<p>Files in the UCD use the following format, unless otherwise specified.</p>
<ul>
- <li>Informative properties for determining the choice of wide vs. narrow
- glyphs in East Asian contexts.</li>
+ <li>Each line of data consists of fields separated by semicolons. The fields
+ are numbered starting with zero. Code points are expressed as hexadecimal
+ numbers with four to six digits. They are written without "U+".
+ Within a sequence of code points, spaces are used for separation. Leading
+ and trailing spaces within a field are not significant.</li>
</ul>
-<p><b>BidiMirroring.txt</b><b> (<a
-href="http://www.unicode.org/unicode/reports/tr9/">UAX #9: The
-Bidirectional Algorithm</a>)</b></p>
<ul>
- <li>Informative properties for substituting characters in an implementation of
- bidirectional mirroring.</li>
+ <li>The first field (0) of each line in the Unicode Character Database files
+ represents a code point or range. The remaining fields (1..n) are properties
+ associated with that code point.</li>
</ul>
-<p><b>CaseFolding.txt (<a href="http://www.unicode.org/unicode/reports/tr21/">UTR
-#21: Case Mappings</a>)</b></p>
<ul>
- <li>Informative file mapping characters to their case-folded form.</li>
+ <li>A range of code points is specified by the form "X..Y". Each
+ code point from X to Y has the associated properties. For example:</li>
</ul>
-<p><b>NormalizationTest.txt (<a
-href="http://www.unicode.org/unicode/reports/tr15/">UAX #15 Unicode Normalization
-Forms</a>)</b></p>
+<blockquote>
+ <pre>0000..007F; Basic Latin
+0080..00FF; Latin-1 Supplement
+
+1680 ; White_space # Zs OGHAM SPACE MARK
+2000..200A; White_space # Zs [11] EN QUAD..HAIR SPACE</pre>
+</blockquote>
<ul>
- <li>Normative test file for conformance to Unicode Normalization Forms.</li>
+ <li>Hash marks ("#") are used to indicate comments: all characters
+ from the hash mark to the end of the line are comments, and disregarded when
+ parsing data. In many files, the comments on data lines use a common format.</li>
</ul>
-<p><b>diffXvY.txt</b>
+<blockquote>
+ <pre>00BC..00BE ; numeric # No [3] VULGAR FRACTION ONE QUARTER..VULGAR FRACTION THREE QUARTERS</pre>
+</blockquote>
<ul>
- <li>Mechanically-generated informative files containing accumulated
- differences between successive versions of UnicodeData.txt</li>
+ <li>The first part of the comment is the UCD general category. The symbol
+ "L&" indicates characters of type Lu, Ll, or Lt. The code
+ point ranges are calculated so that they all have the same General Category
+ (or L&). While this results in more ranges than are strictly necessary,
+ it makes the contents of the ranges clearer. The second part of the comment
+ (in square brackets) indicates the number of items in a range, if there is
+ one. The third part is the name of the character in field zero: if it is a
+ range, then the character names for the ends of the range are separated by
+ "..".</li>
</ul>
+<p>However, the comments are purely informational, and may change format or be
+omitted in the future. They should not be parsed for content.</p>
+<h2><a name="References">References</a></h2>
+<table cellspacing="12" cellpadding="0" width="100%" border="0">
+ <tbody>
+ <tr>
+ <td valign="top" width="1">[<a name="FAQ">FAQ</a>]</td>
+ <td valign="top">Unicode Frequently Asked Questions<br>
+ <a href="http://www.unicode.org/unicode/faq/">http://www.unicode.org/unicode/faq/<br>
+ </a><i>For answers to common questions on technical issues.</i></td>
+ </tr>
+ <tr>
+ <td valign="top" width="1">[<a name="Glossary">Glossary</a>]</td>
+ <td valign="top">Unicode Glossary<a
+ href="http://www.unicode.org/glossary/"><br>
+ http://www.unicode.org/glossary/<br>
+ </a><i>For explanations of terminology used in this and other documents.</i></td>
+ </tr>
+ <tr>
+ <td valign="top" width="1">[<a name="Reports">Reports</a>]</td>
+ <td valign="top">Unicode Technical Reports<br>
+ <a href="http://www.unicode.org/unicode/reports/">http://www.unicode.org/unicode/reports/<br>
+ </a><i>For information on the status and development process for
+ technical reports, and for a list of technical reports.</i></td>
+ </tr>
+ <tr>
+ <td valign="top" width="1">[<a name="U3.1">U3.1</a>]</td>
+ <td valign="top">Unicode Standard Annex #27: Unicode 3.1<a
+ href="http://www.unicode.org/unicode/reports/tr27/"><br>
+ http://www.unicode.org/unicode/reports/tr27/</a></td>
+ </tr>
+ <tr>
+ <td valign="top" width="1">[<a name="Versions">Versions</a>]</td>
+ <td valign="top">Versions of the Unicode Standard<br>
+ <a href="http://www.unicode.org/unicode/standard/versions/">http://www.unicode.org/unicode/standard/versions/<br>
+ </a><i>For details on the precise contents of each version of the
+ Unicode Standard, and how to cite them.</i></td>
+ </tr>
+ </tbody>
+</table>
+<h2><br>
+<i><a name="UCD_Terms">UCD Terms of Use</a></i></h2>
+<h3><i>Disclaimer</i></h3>
+<blockquote>
+ <p><i>The Unicode Character Database is provided as is by Unicode, Inc. No
+ claims are made as to fitness for any particular purpose. No warranties of any
+ kind are expressed or implied. The recipient agrees to determine applicability
+ of information provided. If this file has been purchased on magnetic or
+ optical media from Unicode, Inc., the sole remedy for any claim will be
+ exchange of defective media within 90 days of receipt.</i></p>
+ <p><i>This disclaimer is applicable for all other data files accompanying the
+ Unicode Character Database, some of which have been compiled by the Unicode
+ Consortium, and some of which have been supplied by other sources.</i></p>
+</blockquote>
+<h3><i>Limitations on Rights to Redistribute This Data</i></h3>
+<blockquote>
+ <p><i>Recipient is granted the right to make copies in any form for internal
+ distribution and to freely use the information supplied in the creation of
+ products supporting the Unicode<sup>TM</sup> Standard. The files in the
+ Unicode Character Database can be redistributed to third parties or other
+ organizations (whether for profit or not) as long as this notice and the
+ disclaimer notice are retained. Information can be extracted from these files
+ and used in documentation or programs, as long as there is an accompanying
+ notice indicating the source.</i></p>
+</blockquote>
+<hr width="50%">
+<p align="center"><a href="http://www.unicode.org/unicode/copyright.html"><img
+src="http://www.unicode.org/img/hb_home.gif" border="0" alt="Home" width="40"
+height="49"><img src="http://www.unicode.org/img/hb_mid.gif" border="0"
+alt="Terms of Use" width="152" height="49"><img
+src="http://www.unicode.org/img/hb_mail.gif" border="0" alt="E-mail" width="46"
+height="49"></a>
</body>
<body>
-<h1>UnicodeData File Format<br>
-Version 3.0.1</h1>
+<table width="100%" cellpadding="0" cellspacing="0" border="0">
+ <tr>
+ <td>
+ <table width="100%" border="0" cellpadding="0" cellspacing="0">
+ <tr>
+ <td class="icon"><a href="http://www.unicode.org"><img border="0"
+ src="http://www.unicode.org/webscripts/logo60s2.gif" align="middle"
+ alt="[Unicode]" width="34" height="33"></a> <a
+ class="bar" href="UnicodeCharacterDatabase.html">Unicode Character
+ Database</a></td>
+ </tr>
+ </table>
+ </td>
+ </tr>
+ <tr>
+ <td class="gray"> </td>
+ </tr>
+</table>
+<h1>Unicode Data File Format</h1>
<table border="1" cellspacing="2" cellpadding="0" height="87" width="100%">
<tr>
<td valign="TOP" width="144">Revision</td>
- <td valign="TOP">3.0.1</td>
+ <td valign="TOP">3.1.0</td>
</tr>
<tr>
<td valign="TOP" width="144">Authors</td>
</tr>
<tr>
<td valign="TOP" width="144">Date</td>
- <td valign="TOP">2000-08-17</td>
+ <td valign="TOP">2001-02-28</td>
</tr>
<tr>
<td valign="TOP" width="144">This Version</td>
<td valign="TOP"><a
- href="http://www.unicode.org/Public/3.0-Update1/UnicodeData-3.0.1.html">http://www.unicode.org/Public/3.0-Update1/UnicodeData-3.0.1.html</a></td>
+ href="http://www.unicode.org/Public/3.1-Update/UnicodeData-3.1.0.html">http://www.unicode.org/Public/3.1-Update/UnicodeData-3.1.0.html</a></td>
</tr>
<tr>
<td valign="TOP" width="144">Previous Version</td>
<td valign="TOP"><a
- href="http://www.unicode.org/Public/3.0-Update/UnicodeData-3.0.0.html">http://www.unicode.org/Public/3.0-Update/UnicodeData-3.0.0.html</a></td>
+ href="http://www.unicode.org/Public/3.0-Update1/UnicodeData-3.0.1.html">http://www.unicode.org/Public/3.0-Update1/UnicodeData-3.0.1.html</a></td>
</tr>
<tr>
<td valign="TOP" width="144">Latest Version</td>
href="http://www.unicode.org/Public/UNIDATA/UnicodeData.html">http://www.unicode.org/Public/UNIDATA/UnicodeData.html</a></td>
</tr>
</table>
-<p align="center">Copyright © 1995-2000 Unicode, Inc. All Rights reserved.<br>
-<i>For more information, including Disclamer and Limitations, see <a
-href="UnicodeCharacterDatabase-3.0.1.html">UnicodeCharacterDatabase-3.0.1.html</a></i></p>
+<h3><br>
+<i>Summary</i></h3>
+<blockquote>
+ <p><i>This document describes the format and content of the UnicodeData.txt
+ file in the Unicode Character Database (UCD).</i></p>
+</blockquote>
+<h3><i>Status</i></h3>
+<blockquote>
+ <p><i>This file and the files described herein are part of the Unicode
+ Character Database and governed by the <a href="#UCD_Terms">UCD Terms of
+ Use</a> given below.</i></p>
+ <p><i>For general information on file formats and table formats, and the
+ implications of normative vs informative properties, see
+ UnicodeCharacterDatabase.html. </i></p>
+ <p><i><b>Warning: </b>the information in this file does not completely
+ describe the use and interpretation of Unicode character properties and
+ behavior. It must be used in conjunction with the data in the other files in
+ the UCD, and relies on the notation and definitions supplied in <a
+ href="http://www.unicode.org/unicode/standard/versions/Unicode3.0.html">The
+ Unicode Standard</a>. All chapter references are to Version 3.1.0 of the
+ standard.</i></p>
+</blockquote>
+<h2>Introduction</h2>
<p>This document describes the format of the UnicodeData.txt file, which is one
of the files in the Unicode Character Database. The document is divided into the
following sections:
<li><a href="#Property Invariants">Property Invariants</a></li>
<li><a href="#Modification History">Modification History</a></li>
</ul>
-<p><b>Warning: </b>the information in this file does not completely describe the
-use and interpretation of Unicode character properties and behavior. It must be
-used in conjunction with the data in the other files in the <a
-href="UnicodeCharacterDatabase-3.0.1.html">Unicode Character Database</a>, and
-relies on the notation and definitions supplied in <i><a
-href="http://www.unicode.org/unicode/standard/versions/Unicode3.0.html">The
-Unicode Standard</a></i>. All chapter references are to Version 3.0 of the
-standard.</p>
<h2><a name="Field Formats"></a>Field Formats</h2>
-<p>The file consists of lines containing fields separated by semicolons. Each
-line represents the data for one encoded character in the Unicode Standard.
-Every encoded character has a data entry, with the exception of certain special
-ranges, as detailed below.
+<p>Each line represents the data for one encoded character in the Unicode
+Standard. (For information on the file format, see UCD File Format in
+UnicodeCharacterDatabase.html.)
+<p>Every encoded character has a data entry, with the exception of certain
+special ranges, as detailed below.
<ul>
- <li>There are nine special ranges of characters that are represented only by
- their start and end characters, since the properties in the file are
- uniform, except for code values (which are all sequential and assigned).</li>
+ <li>These ranges are represented only by their start and end characters, since
+ the properties in the file are uniform, except for code values (which are all
+ sequential and assigned).</li>
<li>The names of CJK ideograph characters and the names and decompositions of
Hangul syllable characters are algorithmically derivable. (See the Unicode
Standard and <a href="http://www.unicode.org/unicode/reports/tr15/">Unicode
Standard Annex #15</a> for more information).</li>
<li>Surrogate code values and private use characters have no names.</li>
- <li>The Private Use character outside of the BMP (U+F0000..U+FFFFD,
- U+100000..U+10FFFD) are listed as distinct ranges. These correspond to surrogate pairs
+ <li>The supplementary Private Use characters (U+F0000 .. U+FFFFD, U+100000 ..
+ U+10FFFD) are listed as distinct ranges. These correspond to surrogate pairs
where the first surrogate is in the High Surrogate Private Use section.</li>
</ul>
<p>The exact ranges represented by start and end characters are:
<ul>
- <li>CJK Ideographs Extension A (U+3400 - U+4DB5)</li>
- <li>CJK Ideographs (U+4E00 - U+9FA5)</li>
- <li>Hangul Syllables (U+AC00 - U+D7A3)</li>
- <li>Non-Private Use High Surrogates (U+D800 - U+DB7F)</li>
- <li>Private Use High Surrogates (U+DB80 - U+DBFF)</li>
- <li>Low Surrogates (U+DC00 - U+DFFF)</li>
- <li>The Private Use Area (U+E000 - U+F8FF)</li>
- <li>Plane 15 Private Use Area (U+F0000 - U+FFFFD)</li>
- <li>Plane 16 Private Use Area (U+100000 - U+10FFFD)</li>
+ <li>CJK Ideographs Extension A (U+3400 .. U+4DB5)</li>
+ <li>CJK Ideographs (U+4E00 .. U+9FA5)</li>
+ <li>Hangul Syllables (U+AC00 .. U+D7A3)</li>
+ <li>Non-Private Use High Surrogates (U+D800 .. U+DB7F)</li>
+ <li>Private Use High Surrogates (U+DB80 .. U+DBFF)</li>
+ <li>Low Surrogates (U+DC00 .. U+DFFF)</li>
+ <li>The Private Use Area (U+E000 .. U+F8FF)</li>
+ <li>CJK Ideographs Extension B (U+20000 .. U+2A6D6)</li>
+ <li>Plane 15 Private Use Area (U+F0000 .. U+FFFFD)</li>
+ <li>Plane 16 Private Use Area (U+100000 .. U+10FFFD)</li>
</ul>
<p>The following table describes the format and meaning of each field in a data
-entry in the UnicodeData file. Fields which contain normative information are so
-indicated.</p>
+entry in the UnicodeData file.</p>
<table border="1" cellspacing="2" cellpadding="2">
<tr>
<th valign="top" align="LEFT">
<p align="LEFT">Field</th>
<th valign="top" align="LEFT">
<p align="LEFT">Name</th>
- <th valign="top" align="LEFT">
- <p align="LEFT">Status</th>
+ <th valign="top" align="center">
+ <p align="LEFT">N/I</th>
<th valign="top" align="LEFT">
<p align="LEFT">Explanation</th>
</tr>
<tr>
<th valign="top">0</th>
<td valign="top">Code value</td>
- <td valign="top">normative</td>
- <td valign="top">Code value. For characters in the range U+0000..U+FFFD
- the code value uses a 4-digit hexadecimal format; for characters in the
- range U+10000..U+FFFFD the code value uses a 5-digit hexadecimal format;
- and for characters in the range U+100000..U+10FFFD the code value uses a
- 6-digit hexadecimal format.</td>
+ <td valign="top" align="center">N</td>
+ <td valign="top">Code value.</td>
</tr>
<tr>
<th valign="top">1</th>
<td valign="top">Character name</td>
- <td valign="top">normative</td>
+ <td valign="top" align="center">N</td>
<td valign="top">These names match exactly the names published in Chapter 14
of the Unicode Standard, Version 3.0.</td>
</tr>
<tr>
<th valign="top">2</th>
<td valign="top"><a href="#General Category">General Category</a></td>
- <td valign="top">normative / informative<br>
- (see below)</td>
+ <td valign="top" align="center">N</td>
<td valign="top">This is a useful breakdown into various "character
types" which can be used as a default categorization in
implementations. See below for a brief explanation.</td>
<th valign="top">3</th>
<td valign="top"><a href="#Canonical Combining Classes">Canonical Combining
Classes</a></td>
- <td valign="top">normative</td>
+ <td valign="top" align="center">N</td>
<td valign="top">The classes used for the Canonical Ordering Algorithm in
the Unicode Standard. These classes are also printed in Chapter 4 of the
Unicode Standard.</td>
<tr>
<th valign="top">4</th>
<td valign="top"><a href="#Bidirectional Category">Bidirectional Category</a></td>
- <td valign="top">normative</td>
+ <td valign="top" align="center">N</td>
<td valign="top">See the list below for an explanation of the abbreviations
used in this field. These are the categories required by the Bidirectional
Behavior Algorithm in the Unicode Standard. These categories are
<th valign="top">5</th>
<td valign="top"><a href="#Character Decomposition">Character Decomposition
Mapping</a></td>
- <td valign="top">normative</td>
+ <td valign="top" align="center">N</td>
<td valign="top">In the Unicode Standard, not all of the mappings are full
(maximal) decompositions. Recursive application of look-up for
decompositions will, in all cases, lead to a maximal decomposition. The
<tr>
<th valign="top">6</th>
<td valign="top">Decimal digit value</td>
- <td valign="top">normative</td>
+ <td valign="top" align="center">N</td>
<td valign="top">This is a numeric field. If the character has the decimal
digit property, as specified in Chapter 4 of the Unicode Standard, the
value of that digit is represented with an integer value in this field</td>
<tr>
<th valign="top">7</th>
<td valign="top">Digit value</td>
- <td valign="top">normative</td>
+ <td valign="top" align="center">N</td>
<td valign="top">This is a numeric field. If the character represents a
digit, not necessarily a decimal digit, the value is here. This covers
digits which do not form decimal radix forms, such as the compatibility
<tr>
<th valign="top">8</th>
<td valign="top">Numeric value</td>
- <td valign="top">normative</td>
+ <td valign="top" align="center">N</td>
<td valign="top">This is a numeric field. If the character has the numeric
property, as specified in Chapter 4 of the Unicode Standard, the value of
that character is represented with an integer or rational number in this
<tr>
<th valign="top">9</th>
<td valign="top">Mirrored</td>
- <td valign="top">normative</td>
+ <td valign="top" align="center">N</td>
<td valign="top">If the character has been identified as a
"mirrored" character in bidirectional text, this field has the
value "Y"; otherwise "N". The list of mirrored
<tr>
<th valign="top">10</th>
<td valign="top">Unicode 1.0 Name</td>
- <td valign="top">informative</td>
+ <td valign="top" align="center">I</td>
<td valign="top">This is the old name as published in Unicode 1.0. This name
- is only provided when it is significantly different from the Unicode 3.0
- name for the character.</td>
+ is only provided when it is significantly different from the current name
+ for the character.</td>
</tr>
<tr>
<th valign="top">11</th>
<td valign="top">10646 comment field</td>
- <td valign="top">informative</td>
- <td valign="top">This is the ISO 10646 comment field. It appears in parentheses
- in the 10646 names list, or contains an asterisk to mark an Annex P note.</td>
+ <td valign="top" align="center">I</td>
+ <td valign="top">This is the ISO 10646 comment field. It appears in
+ parentheses in the 10646 names list, or contains an asterisk to mark an
+ Annex P note.</td>
</tr>
<tr>
<th valign="top">12</th>
<td valign="top"><a href="#Case Mappings">Uppercase Mapping</a></td>
- <td valign="top">informative</td>
+ <td valign="top" align="center">N</td>
<td valign="top">Upper case equivalent mapping. If a character is part of an
- alphabet with case distinctions, and has an upper case equivalent, then
- the upper case equivalent is in this field. See the explanation below on
- case distinctions. These mappings are always one-to-one, not one-to-many
- or many-to-one. This field is informative.</td>
+ alphabet with case distinctions, and has a simple upper case equivalent,
+ then the upper case equivalent is in this field. See the explanation below
+ on case distinctions. These mappings are always one-to-one, not
+ one-to-many or many-to-one.
+ <p><i>For full case mappings, see <a
+ href="http://www.unicode.org/unicode/reports/tr21/">UTR #21</a> and
+ SpecialCasing.txt.</i></p>
+ </td>
</tr>
<tr>
<th valign="top">13</th>
<td valign="top"><a href="#Case Mappings">Lowercase Mapping</a></td>
- <td valign="top">informative</td>
+ <td valign="top" align="center">N</td>
<td valign="top">Similar to Uppercase mapping</td>
</tr>
<tr>
<th valign="top">14</th>
<td valign="top"><a href="#Case Mappings">Titlecase Mapping</a></td>
- <td valign="top">informative</td>
+ <td valign="top" align="center">N</td>
<td valign="top">Similar to Uppercase mapping</td>
</tr>
</table>
<h3><a name="General Category"></a>General Category</h3>
-<p>The values in this field are abbreviations for the following. Some of the
-values are normative, and some are informative. For more information, see the
-Unicode Standard.</p>
-<p><b>Note:</b> the standard does not assign information to control characters
-(except for certain cases in the Bidirectional Algorithm). Implementations will
-generally also assign categories to certain control characters, notably CR and
-LF, according to platform conventions.</p>
-<h4>Normative Categories</h4>
-<table border="0" cellspacing="2" cellpadding="0">
+<p>The values in this field are abbreviations for the following values. For more
+information, see the Unicode Standard.</p>
+<blockquote>
+ <p><b>Note:</b> the standard does not assign information to control characters
+ (except for certain cases in the Bidirectional Algorithm). Implementations
+ will generally also assign categories to certain control characters, notably
+ CR and LF, according to platform conventions. See <a
+ href="http://www.unicode.org/unicode/reports/tr13/">UAX #13: Unicode Newline
+ Guidelines</a> for more information.</p>
+</blockquote>
+<table border="0" cellspacing="0" cellpadding="4">
<tr>
<th>
<p align="LEFT">Abbr.</th>
<td>Letter, Titlecase</td>
</tr>
<tr>
+ <td align="CENTER">Lm</td>
+ <td>Letter, Modifier</td>
+ </tr>
+ <tr>
+ <td align="CENTER">Lo</td>
+ <td>Letter, Other</td>
+ </tr>
+ <tr>
<td align="CENTER">Mn</td>
<td>Mark, Non-Spacing</td>
</tr>
<td>Number, Other</td>
</tr>
<tr>
- <td align="CENTER">Zs</td>
- <td>Separator, Space</td>
- </tr>
- <tr>
- <td align="CENTER">Zl</td>
- <td>Separator, Line</td>
- </tr>
- <tr>
- <td align="CENTER">Zp</td>
- <td>Separator, Paragraph</td>
- </tr>
- <tr>
- <td align="CENTER">Cc</td>
- <td>Other, Control</td>
- </tr>
- <tr>
- <td align="CENTER">Cf</td>
- <td>Other, Format</td>
- </tr>
- <tr>
- <td align="CENTER">Cs</td>
- <td>Other, Surrogate</td>
- </tr>
- <tr>
- <td align="CENTER">Co</td>
- <td>Other, Private Use</td>
- </tr>
- <tr>
- <td align="CENTER">Cn</td>
- <td>Other, Not Assigned (no characters in the file have this property)</td>
- </tr>
-</table>
-<h4>Informative Categories</h4>
-<table border="0" cellspacing="2" cellpadding="0">
- <tr>
- <th>
- <p align="LEFT">Abbr.</th>
- <th>
- <p align="LEFT">Description</th>
- </tr>
- <tr>
- <td align="CENTER">Lm</td>
- <td>Letter, Modifier</td>
- </tr>
- <tr>
- <td align="CENTER">Lo</td>
- <td>Letter, Other</td>
- </tr>
- <tr>
<td align="CENTER">Pc</td>
<td>Punctuation, Connector</td>
</tr>
<td align="CENTER">So</td>
<td>Symbol, Other</td>
</tr>
+ <tr>
+ <td align="CENTER">Zs</td>
+ <td>Separator, Space</td>
+ </tr>
+ <tr>
+ <td align="CENTER">Zl</td>
+ <td>Separator, Line</td>
+ </tr>
+ <tr>
+ <td align="CENTER">Zp</td>
+ <td>Separator, Paragraph</td>
+ </tr>
+ <tr>
+ <td align="CENTER">Cc</td>
+ <td>Other, Control</td>
+ </tr>
+ <tr>
+ <td align="CENTER">Cf</td>
+ <td>Other, Format</td>
+ </tr>
+ <tr>
+ <td align="CENTER">Cs</td>
+ <td>Other, Surrogate</td>
+ </tr>
+ <tr>
+ <td align="CENTER">Co</td>
+ <td>Other, Private Use</td>
+ </tr>
+ <tr>
+ <td align="CENTER">Cn</td>
+ <td>Other, Not Assigned (no characters in the file have this property)</td>
+ </tr>
</table>
+<blockquote>
+ <p><b>Note:</b> The term "L&" is sometimes used to stand for
+ Uppercase, Lowercase or Titlecase letters (Lu, Ll, or Lt).</p>
+</blockquote>
<h3><a name="Bidirectional Category"></a>Bidirectional Category</h3>
<p>Please refer to Chapter 3 for an explanation of the algorithm for
Bidirectional Behavior and an explanation of the significance of these
categories. An up-to-date version can be found on <a
href="http://www.unicode.org/unicode/reports/tr9/">Unicode Standard Annex #9:
-The Bidirectional Algorithm</a>. These values are normative.</p>
-<table border="0" cellpadding="2">
+The Bidirectional Algorithm</a>.</p>
+<table border="0" cellpadding="4" cellspacing="0">
<tr>
<th valign="TOP" align="LEFT">
<p align="LEFT">Type</th>
</tr>
</table>
<h3><a name="Character Decomposition"></a>Character Decomposition Mapping</h3>
-<p>The decomposition is a normative property of a character. The tags supplied
-with certain decomposition mappings generally indicate formatting information.
-Where no such tag is given, the mapping is designated as canonical. Conversely,
-the presence of a formatting tag also indicates that the mapping is a
-compatibility mapping and not a canonical mapping. In the absence of other
-formatting information in a compatibility mapping, the tag is used to
+<p>The tags supplied with certain decomposition mappings generally indicate
+formatting information. Where no such tag is given, the mapping is designated as
+canonical. Conversely, the presence of a formatting tag also indicates that the
+mapping is a compatibility mapping and not a canonical mapping. In the absence
+of other formatting information in a compatibility mapping, the tag is used to
distinguish it from canonical mappings.</p>
<p>In some instances a canonical mapping or a compatibility mapping may consist
of a single character. For a canonical mapping, this indicates that the
compatibility mapping, this indicates that the character is a compatibility
equivalent of another single character. The compatibility formatting tags used
are:</p>
-<table border="0" cellspacing="2" cellpadding="0">
+<table border="0" cellspacing="0" cellpadding="4">
<tr>
<th>Tag</th>
<th>
reordering algorithm.</li>
</ul>
<h3><a name="Canonical Combining Classes"></a>Canonical Combining Classes</h3>
-<table border="0" cellspacing="2" cellpadding="0">
+<table border="0" cellspacing="0" cellpadding="4">
<tr>
<th>
<p align="LEFT">Value</th>
Normalization</h3>
<p>Decomposition is specified in Chapter 3. <a
href="http://www.unicode.org/unicode/reports/tr15/"><i>Unicode Standard Annex
-#15: Unicode Normalization Forms</i></a> specifies the interaction between decomposition
-and normalization. The most up-to-date version is found on <a
-href="http://www.unicode.org/unicode/reports/tr15/">http://www.unicode.org/unicode/reports/tr15/</a>.
-That report specifies how the decompositions defined in UnicodeData.txt are used
-to derive normalized forms of Unicode text.</p>
+#15: Unicode Normalization Forms</i></a> specifies the interaction between
+decomposition and normalization. That report specifies how the decompositions
+defined in UnicodeData.txt are used to derive normalized forms of Unicode text.</p>
<p>Note that as of the 2.1.9 update of the Unicode Character Database, the
decompositions in the UnicodeData.txt file can be used to recursively derive the
full decomposition in canonical order, without the need to separately apply
canonical reordering. However, canonical reordering of combining character
-sequences must still be applied in decomposition when normalizing source text
-which contains any combining marks.</p>
+sequences <b><i>must</i></b> still be applied in decomposition when normalizing
+source text which contains any combining marks.</p>
<h3><a name="Case Mappings"></a>Case Mappings</h3>
-<p>The case mapping is an informative, default mapping. Case itself, on the
-other hand, has normative status. Thus, for example, 0041 LATIN CAPITAL LETTER A
-is normatively uppercase, but its lowercase mapping the 0061 LATIN SMALL LETTER
-A is informative. The reason for this is that case can be considered to be an
-inherent property of a particular character (and is usually, but not always,
-derivable from the presence of the terms "CAPITAL" or
-"SMALL" in the character name), but case mappings between characters
-are occasionally influenced by local conventions. For example, certain
-languages, such as Turkish, German, French, or Greek may have small deviations
-from the default mappings listed in UnicodeData.</p>
-<p>In addition to uppercase and lowercase, because of the inclusion of certain
-composite characters for compatibility, such as 01F1 LATIN CAPITAL LETTER DZ,
-there is a third case, called <i>titlecase</i>, which is used where the first
-letter of a word is to be capitalized (e.g. UPPERCASE, Titlecase, lowercase). An
-example of such a titlecase letter is 01F2 LATIN CAPITAL LETTER D WITH SMALL
-LETTER Z.</p>
-<p>The uppercase, titlecase and lowercase fields are only included for
-characters that have a single corresponding character of that type. Composite
-characters (such as "339D SQUARE CM") that do not have a single
-corresponding character of that type can be cased by decomposition.</p>
-<p>For compatibility with existing parsers, UnicodeData only contains case
+<p>There are a number of complications to case mappings that occur once the
+repertoire of characters is expanded beyond ASCII. For more information, see <a
+href="http://www.unicode.org/unicode/reports/tr21/">UTR #21: Case Mappings</a>.</p>
+<p>For compatibility with existing parsers, UnicodeData.txt only contains case
mappings for characters where they are one-to-one mappings; it also omits
information about context-sensitive case mappings. Information about these
-special cases can be found in a separate data file, <a
-href="http://www.unicode.org/Public/3.0-Update1/">SpecialCasing.txt</a>, which
-has been added starting with the 2.1.8 update to the Unicode data files.
-SpecialCasing.txt contains additional informative case mappings that are either
-not one-to-one or which are context-sensitive.</p>
+special cases can be found in a separate data file, SpecialCasing.txt.</p>
<h2><a name="Property Invariants"></a>Property Invariants</h2>
<p>Values in UnicodeData.txt are subject to correction as errors are found;
however, some characteristics of the categories themselves can be considered
invariants. Applications may wish to take these invariants into account when
-choosing how to implement character properties. The following is a partial list
-of known invariants for the Unicode Character Database.</p>
+choosing how to implement character properties. For more information, see <a
+href="http://www.unicode.org/unicode/standard/policies.html">Unicode Policies</a>.</p>
+<p>The following is a partial list of known invariants for the Unicode Character
+Database.</p>
<h4>Database Fields</h4>
<ul>
<li>The number of fields in UnicodeData.txt is fixed.</li>
</ul>
</li>
</ul>
-<h4>Case</h4>
-<ul>
- <li>Characters of type Lu, Lt, or Ll are called <i>cased</i>. All characters
- with an Upper, Lower, or Titlecase mapping are cased characters.
- <ul>
- <li>However, characters with the General Categories of Lu, Ll, or Lt may
- not always have case mappings, and case mappings may vary by locale.
- (See http://www.unicode.org/Public/UNIDATA/SpecialCasing.txt).</li>
- </ul>
- </li>
-</ul>
<h4>Canonical Decomposition</h4>
<ul>
<li>Canonical mappings are always in canonical order.</li>
<p>This section provides a summary of the changes between update versions of the
Unicode Standard.</p>
<h3><a
+href="http://www.unicode.org/unicode/standard/versions/enumeratedversions.html#Unicode 3.1">Unicode
+3.1</a></h3>
+<p>Modifications made for Version 3.1 of UnicodeData.txt include:
+<ul>
+ <li>Addition of 2237 new entries, to cover new characters and new ranges of
+ unified Han characters encoded in Unicode 3.1.</li>
+ <li>Changed General Category value of 16EE..16F0 (Runic golden numbers) from
+ No to Nl.</li>
+</ul>
+<h3><a
href="http://www.unicode.org/unicode/standard/versions/enumeratedversions.html#Unicode 3.0.1">Unicode
3.0.1</a></h3>
<p>Modifications made for Version 3.0.1 of UnicodeData.txt include:
<li>Added categories Me, Sk, Pc, Nl, Cs, Cf, and rectified a number of
mistakes in the database.</li>
</ul>
+<h2><i><a name="UCD_Terms">UCD Terms of Use</a></i></h2>
+<h3><i>Disclaimer</i></h3>
+<blockquote>
+ <p><i>The Unicode Character Database is provided as is by Unicode, Inc. No
+ claims are made as to fitness for any particular purpose. No warranties of any
+ kind are expressed or implied. The recipient agrees to determine applicability
+ of information provided. If this file has been purchased on magnetic or
+ optical media from Unicode, Inc., the sole remedy for any claim will be
+ exchange of defective media within 90 days of receipt.</i></p>
+ <p><i>This disclaimer is applicable for all other data files accompanying the
+ Unicode Character Database, some of which have been compiled by the Unicode
+ Consortium, and some of which have been supplied by other sources.</i></p>
+</blockquote>
+<h3><i>Limitations on Rights to Redistribute This Data</i></h3>
+<blockquote>
+ <p><i>Recipient is granted the right to make copies in any form for internal
+ distribution and to freely use the information supplied in the creation of
+ products supporting the Unicode<sup>TM</sup> Standard. The files in the
+ Unicode Character Database can be redistributed to third parties or other
+ organizations (whether for profit or not) as long as this notice and the
+ disclaimer notice are retained. Information can be extracted from these files
+ and used in documentation or programs, as long as there is an accompanying
+ notice indicating the source.</i></p>
+</blockquote>
+<hr width="50%">
+<div align="center">
+ <center>
+ <table cellspacing="0" cellpadding="0" border="0">
+ <tr>
+ <td><a href="http://www.unicode.org/unicode/copyright.html"><img
+ src="http://www.unicode.org/img/hb_home.gif" border="0" alt="Home"
+ width="40" height="49"><img src="http://www.unicode.org/img/hb_mid.gif"
+ border="0" alt="Terms of Use" width="152" height="49"><img
+ src="http://www.unicode.org/img/hb_mail.gif" border="0" alt="E-mail"
+ width="46" height="49"></a></td>
+ </tr>
+ </table>
+ </center>
+</div>
</body>
-ArabicShaping.txt ArabShap.txt
-BidiMirroring.txt BidiMirr.txt
-Blocks.txt Blocks.txt
-CaseFolding.txt CaseFold.txt
-CompositionExclusions.txt CompExcl.txt
-EastAsianWidth.txt EAWidth.txt
-Index.txt Index.txt
-Jamo.txt Jamo.txt
-LineBreak.txt LineBrk.txt
-NamesList.html NamesList.html
-NamesList.txt Names.txt
-PropList.txt PropList.txt
-ReadMe.txt ReadMe.txt
-SpecialCasing.txt SpecCase.txt
-UnicodeCharacterDatabase.html UCD.html
-UnicodeData.html Unicode.html
-UnicodeData.txt Unicode.txt
-
+ArabicShaping.txt ArabShap.txt
+BidiMirroring.txt BidiMirr.txt
+#Blocks.txt Blocks.txt
+CaseFolding.txt CaseFold.txt
+CompositionExclusions.txt CompExcl.txt
+EastAsianWidth.txt EAWidth.txt
+#Index.txt Index.txt
+#Jamo.txt Jamo.txt
+LineBreak.txt LineBrk.txt
+#NamesList.html NamesList.html
+#NamesList.txt NamesList.txt
+#PropList.txt PropList.txt
+#PropList.html PropList.html
+#ReadMe.txt ReadMe.txt
+SpecialCasing.txt SpecCase.txt
+UnicodeCharacterDatabase.html UCD.html
+UnicodeData.html Unicode.html
+UnicodeData.txt Unicode.txt
-3.1 beta 2001-03-23
-
+3.1