From: Jarkko Hietaniemi Date: Sat, 31 Mar 2001 00:52:42 +0000 (+0000) Subject: Update to Unicode 3.1. X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=06bfd75b095ff68f02a5c20de85d0375301fc5f7;p=p5sagit%2Fp5-mst-13.2.git Update to Unicode 3.1. (Rename Names.txt to NamesList.txt.) p4raw-id: //depot/perl@9483 --- diff --git a/lib/unicode/Is/LbrkAI.pl b/lib/unicode/Is/LbrkAI.pl index 325c0a0..aeabcfa 100644 --- a/lib/unicode/Is/LbrkAI.pl +++ b/lib/unicode/Is/LbrkAI.pl @@ -58,7 +58,7 @@ return <<'END'; 0401 0410 044f 0451 -2016 +2015 2016 2020 2021 203b 2074 @@ -117,7 +117,7 @@ return <<'END'; 25b6 25b7 25bc 25bd 25c0 25c1 -25c6 25c7 +25c6 25c8 25cb 25ce 25d1 25e2 25e5 @@ -134,6 +134,5 @@ return <<'END'; 2667 266a 266c 266d 266f -e000 f8ff fffd END diff --git a/lib/unicode/Is/LbrkAL.pl b/lib/unicode/Is/LbrkAL.pl index 61938d7..b2b01b3 100644 --- a/lib/unicode/Is/LbrkAL.pl +++ b/lib/unicode/Is/LbrkAL.pl @@ -254,7 +254,6 @@ return <<'END'; 1fdd 1fef 1ff2 1ff4 1ff6 1ffe -2015 2017 2022 2023 2038 @@ -327,7 +326,7 @@ return <<'END'; 25b8 25bb 25be 25bf 25c2 25c5 -25c8 25ca +25c9 25ca 25cc 25cd 25d2 25e1 25e6 25ee @@ -372,7 +371,6 @@ fbd3 fd3d fd50 fd8f fd92 fdc7 fdf0 fdfb -fe6b fe70 fe72 fe74 fe76 fefc diff --git a/lib/unicode/Is/LbrkBA.pl b/lib/unicode/Is/LbrkBA.pl index ac7db79..fc5088b 100644 --- a/lib/unicode/Is/LbrkBA.pl +++ b/lib/unicode/Is/LbrkBA.pl @@ -5,7 +5,6 @@ return <<'END'; 0009 007c 00ad -00b4 058a 0f0b 1361 diff --git a/lib/unicode/Is/LbrkBB.pl b/lib/unicode/Is/LbrkBB.pl index e8b866d..0a33d5d 100644 --- a/lib/unicode/Is/LbrkBB.pl +++ b/lib/unicode/Is/LbrkBB.pl @@ -2,6 +2,7 @@ # This file is built by mktables.PL from e.g. Unicode.txt. # Any changes made here will be lost! 
return <<'END'; +00b4 02c8 02cc 1806 diff --git a/lib/unicode/Is/LbrkID.pl b/lib/unicode/Is/LbrkID.pl index c0bd0e5..8821c93 100644 --- a/lib/unicode/Is/LbrkID.pl +++ b/lib/unicode/Is/LbrkID.pl @@ -50,15 +50,12 @@ return <<'END'; 3300 3376 337b 33dd 33e0 33fe -3400 4db5 -4e00 9fa5 a000 a48c a490 a4a1 a4a4 a4b3 a4b5 a4c0 a4c2 a4c4 a4c6 -ac00 d7a3 f900 fa2d fe30 fe34 fe49 fe4f @@ -66,6 +63,7 @@ fe51 fe58 fe5f fe66 fe68 +fe6b ff02 ff03 ff06 ff07 ff0a ff0b @@ -78,6 +76,5 @@ ff3e ff5a ff5c ff5e ffe2 ffe4 -20000 2a6d6 2f800 2fa1d END diff --git a/lib/unicode/Is/LbrkSG.pl b/lib/unicode/Is/LbrkSG.pl index 9cf13e9..272f63f 100644 --- a/lib/unicode/Is/LbrkSG.pl +++ b/lib/unicode/Is/LbrkSG.pl @@ -2,7 +2,4 @@ # This file is built by mktables.PL from e.g. Unicode.txt. # Any changes made here will be lost! return <<'END'; -d800 db7f -db80 dbff -dc00 dfff END diff --git a/lib/unicode/LineBrk.txt b/lib/unicode/LineBrk.txt index 8354ae6..672701b 100644 --- a/lib/unicode/LineBrk.txt +++ b/lib/unicode/LineBrk.txt @@ -16,11 +16,8 @@ # - Assigned characters that are not listed explicitly are given the value # "AL". # - Unassigned characters are given the value "XX". -# - Characters within ranges marked by "First>" and "Last>" are omitted, -# as in UnicodeData.txt. For example, the following means that -# all characters between 3400 and 4DB5 have the value "ID" -# 3400;ID; -# 4DB5;ID; +# - Characters ranges are specified as for other property files in +# the Unicode Character Database. # # The Unicode name of each character is provided in a comment for help # in identifying the characters. 
@@ -206,7 +203,7 @@ 00B1;PR # PLUS-MINUS SIGN 00B2;AI # SUPERSCRIPT TWO 00B3;AI # SUPERSCRIPT THREE -00B4;BA # ACUTE ACCENT +00B4;BB # ACUTE ACCENT 00B5;AL # MICRO SIGN 00B6;AI # PILCROW SIGN 00B7;AI # MIDDLE DOT @@ -5144,7 +5141,7 @@ 2012;BA # FIGURE DASH 2013;BA # EN DASH 2014;B2 # EM DASH -2015;AL # HORIZONTAL BAR +2015;AI # HORIZONTAL BAR 2016;AI # DOUBLE VERTICAL LINE 2017;AL # DOUBLE LOW LINE 2018;QU # LEFT SINGLE QUOTATION MARK @@ -6253,7 +6250,7 @@ 25C5;AL # WHITE LEFT-POINTING POINTER 25C6;AI # BLACK DIAMOND 25C7;AI # WHITE DIAMOND -25C8;AL # WHITE DIAMOND CONTAINING BLACK SMALL DIAMOND +25C8;AI # WHITE DIAMOND CONTAINING BLACK SMALL DIAMOND 25C9;AL # FISHEYE 25CA;AL # LOZENGE 25CB;AI # WHITE CIRCLE @@ -8037,10 +8034,8 @@ 33FC;ID # IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY TWENTY-NINE 33FD;ID # IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY 33FE;ID # IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY-ONE -3400;ID # -4DB5;ID # -4E00;ID # -9FA5;ID # +3400..4DB5;ID # .. +4E00..9FA5;ID # .. A000;ID # YI SYLLABLE IT A001;ID # YI SYLLABLE IX A002;ID # YI SYLLABLE I @@ -9256,16 +9251,11 @@ A4C2;ID # YI RADICAL SHOP A4C3;ID # YI RADICAL CHE A4C4;ID # YI RADICAL ZZIET A4C6;ID # YI RADICAL KE -AC00;ID # -D7A3;ID # -D800;SG # -DB7F;SG # -DB80;SG # -DBFF;SG # -DC00;SG # -DFFF;SG # -E000;AI # -F8FF;AI # +AC00..D7A3;ID # .. +D800..DB7F;SG # .. +DB80..DBFF;SG # .. +DC00..DFFF;SG # .. +E000..F8FF;XX # .. 
F900;ID # CJK COMPATIBILITY IDEOGRAPH-F900 F901;ID # CJK COMPATIBILITY IDEOGRAPH-F901 F902;ID # CJK COMPATIBILITY IDEOGRAPH-F902 @@ -10276,7 +10266,7 @@ FE66;ID # SMALL EQUALS SIGN FE68;ID # SMALL REVERSE SOLIDUS FE69;PR # SMALL DOLLAR SIGN FE6A;PO # SMALL PERCENT SIGN -FE6B;AL # SMALL COMMERCIAL AT +FE6B;ID # SMALL COMMERCIAL AT FE70;AL # ARABIC FATHATAN ISOLATED FORM FE71;AL # ARABIC TATWEEL WITH FATHATAN ABOVE FE72;AL # ARABIC DAMMATAN ISOLATED FORM @@ -12239,8 +12229,7 @@ FFFD;AI # REPLACEMENT CHARACTER 1D7FD;NU # MATHEMATICAL MONOSPACE DIGIT SEVEN 1D7FE;NU # MATHEMATICAL MONOSPACE DIGIT EIGHT 1D7FF;NU # MATHEMATICAL MONOSPACE DIGIT NINE -20000;ID # -2A6D6;ID # +20000..2A6D6;ID # .. 2F800;ID # CJK COMPATIBILITY IDEOGRAPH-2F800 2F801;ID # CJK COMPATIBILITY IDEOGRAPH-2F801 2F802;ID # CJK COMPATIBILITY IDEOGRAPH-2F802 @@ -12880,3 +12869,5 @@ E007C;CM # TAG VERTICAL LINE E007D;CM # TAG RIGHT CURLY BRACKET E007E;CM # TAG TILDE E007F;CM # CANCEL TAG +F0000..FFFFD;XX # .. +100000..10FFFD;XX # .. diff --git a/lib/unicode/NamesList.html b/lib/unicode/NamesList.html index 0bfc5db..fd9f21c 100644 --- a/lib/unicode/NamesList.html +++ b/lib/unicode/NamesList.html @@ -1,17 +1,94 @@ + + - -Unicode 3.0 NamesList File Structure + + + + + + +UCD: Unicode NamesList File Format + + - - -

Unicode NamesList File Format

- -

Last updated: 1999-07-06

- -

1.0 Introduction

+ + + + + + + + + +
+ + + + +
[Unicode]  Unicode Character + Database
+
 
+

Unicode NamesList File Format

+ + + + + + + + + + + + + + + + + + + + + + + + + + + +
Revision3.1
AuthorsAsmus Freytag
Date2001-02-26
This Versionhttp://www.unicode.org/Public/3.1-Update/NamesList-2.html
Previous Versionhttp://www.unicode.org/Public/3.0-Update/NamesList-1.html
Latest Versionhttp://www.unicode.org/Public/UNIDATA/NamesList.html
+

+
+Summary

+
+

This file describes the format and contents of NamesList.txt

+
+

Status

+
+

+The file and the files described herein are part of the Unicode Character Database +(UCD) +and are governed by the UCD Terms of Use stated at the end.

+
+
+ +

1.0 Introduction

The Unicode name list file NamesList.txt (also NamesList.lst) is a plain text file used to drive the layout of the character code charts in the Unicode Standard. The information @@ -85,12 +162,12 @@ CHAR_ENTRY: NAME_LINE | RESERVED_LINE | CHAR_ENTRY NOTICE -

In other words:
+

In other words:

-Neither TITLE nor  SUBTITLE may occur after the first BLOCKHEADER.

+Neither TITLE nor  SUBTITLE may occur after the first BLOCKHEADER.

-

Only TITLE, SUBTITLE, SUBHEADER, PAGEBREAK, COMMENT_LINE,  and IGNORED_LINE may -occur before the first BLOCKHEADER.

+

Only TITLE, SUBTITLE, SUBHEADER, PAGEBREAK, COMMENT_LINE,  and IGNORED_LINE may +occur before the first BLOCKHEADER.

Directly following either a NAME_LINE or a RESERVED_LINE an uninterrupted sequence of the following lines may occur (in any order and repeated as often as needed): ALIAS_LINE, @@ -168,7 +245,7 @@ EMPTY_LINE: LF // blank page, then output one or more charts // followed by the list of character names. // use BLOCKSTART and BLOCKEND to define the - // what characters belong to a block + // characters belonging to a block // use blockname in page and table headers "@@" <tab> BLOCKSTART <tab> BLOCKNAME COMMENT <tab> BLOCKEND // if a comment is present it replaces the blockname @@ -188,21 +265,37 @@ EMPTY_LINE: LF // character corresponding to char // If character is combining, it is replaced with // CHAR NBSP <circ> x NBSP where <circ> is the - // dotted circle - + // dotted circle + +

Notes: + +

+ +
    +
  • Blocks must be aligned on a 16-code point boundary and contain an integer + multiple of 16 code points. The exception to that rule is for blocks of + ideographs etc. for which no names are listed in the file. Such blocks must + end on the actual last character.
  • +
  • Blocks must be non-overlapping and in ascending order.  Namelines + must be in ascending order and following the block header for the block to + which they belong.
  • +
  • Reserved entries are optional, and will be supplied automatically. They + are required whenever followed by ALIAS_LINE, COMMENT_LINE or CROSS_REF.
  • +

1.4 NamesList File Primitives

The following are the primitives and terminals for the NamesList syntax.

-
LINE:		STRING LF
-COMMENT:	"(" NAME ")"
-		"(" NAME ")" "*"
-
-NAME:	  	<sequence of ASCII characters, except "(" or ")" > 
+
LINE:		STRING LF
+COMMENT:		"(" NAME ")"
+		"(" NAME ")" "*" 
+BLOCKNAME:	<sequence of Latin-1 characters, except "(" and ")"> 
+NAME:	  	<sequence of uppercase ASCII letters, digit and hyphen> 
 STRING:	  	<sequence of Latin-1 characters> 
 CHAR:		X X X X
-		| X X X X X X X X X
+		| X X X X X
+		| X X X X X X
 X:	  	"0"|"1"|"2"|"3"|"4"|"5"|"6"|"7"|"8"|"9"|"A"|"B"|"C"|"D"|"E"|"F" 
 <tab>:	  	<sequence of one or more ASCII tab characters 0x09>	
 SP:	  	<ASCII 0x20>
@@ -213,14 +306,67 @@ COMMENT:	"(" NAME ")"
 
 
  • Special lookahead logic prevents a mention of a 4 digit standard, such as ISO 9999 from - being misinterpreted as ISO CHAR.
  • + being misinterpreted as ISO CHAR. The - in a character range CHAR-CHAR is + replaced by an EN DASH.
  • Use of Latin-1 is supported in unibook.exe, but not portably, unless the file is encoded as UTF-16LE.
  • The final LF in the file must be present.
  • -
  • A CHAR inside ' or " is expanded, but only its glyph image is printed,  the - code value is not echoed
  • -
  • Straight quotes in an EXPAND_LINE are replaced by curly quotes using English rules. - Apostrophes are supported, but nested quotes are not.
  • +
  • A CHAR inside ' or " is expanded, but only its glyph image is printed,  + the + code value is not echoed.
  • +
  • Straight quotes in an EXPAND_LINE are replaced by curly quotes using English rules. + Apostrophes are supported, but nested quotes are not.
- - +

Modifications

+

Use of 4-6 digit hex notation is now supported.

+
+

+UCD Terms of Use

+

+Disclaimer

+
+

The Unicode Character Database is provided as is by Unicode, Inc. No + claims are made as to fitness for any particular purpose. No warranties of any + kind are expressed or implied. The recipient agrees to determine applicability + of information provided. If this file has been purchased on magnetic or + optical media from Unicode, Inc., the sole remedy for any claim will be + exchange of defective media within 90 days of receipt.

+

This disclaimer is applicable for all other data files accompanying the + Unicode Character Database, some of which have been compiled by the Unicode + Consortium, and some of which have been supplied by other sources.

+
+

Limitations on Rights to Redistribute This Data

+
+

Recipient is granted the right to make copies in any form for internal + distribution and to freely use the information supplied in the creation of + products supporting the UnicodeTM Standard. The files in the + Unicode Character Database can be redistributed to third parties or other + organizations (whether for profit or not) as long as this notice and the + disclaimer notice are retained. Information can be extracted from these files + and used in documentation or programs, as long as there is an accompanying + notice indicating the source.

+
+
+
+
+ + + + +
HomeTerms of UseE-mail
+ +
+
+ + + + + diff --git a/lib/unicode/Names.txt b/lib/unicode/NamesList.txt similarity index 100% rename from lib/unicode/Names.txt rename to lib/unicode/NamesList.txt diff --git a/lib/unicode/PropList.html b/lib/unicode/PropList.html new file mode 100644 index 0000000..665fc67 --- /dev/null +++ b/lib/unicode/PropList.html @@ -0,0 +1,252 @@ + + + + + + + + + + +UCD: Extended Character Properties + + + + + + + + + + + + +
+ + + + +
[Unicode]  Unicode Character + Database
+
 
+

Extended Character Properties

+ + + + + + + + + + + + + + + + + + + + + + + + + + + +
Revision3.1.0
AuthorsMark Davis
Date2001-02-28
This Versionhttp://www.unicode.org/Public/3.1-Update/PropList-3.1.0.html
Previous Versionn/a
Latest Versionhttp://www.unicode.org/Public/UNIDATA/PropList.html
+


+Summary

+
+

This document describes the format and content of the PropList.txt data + file in the Unicode Character Database (UCD).

+
+

Status

+
+

The file and the files described herein are part of the Unicode + Character Database and governed by the UCD Terms of Use + given below.

+

For general information on file formats and table formats, and the + implications of normative vs informative properties, see + UnicodeCharacterDatabase.html.

+

Warning: the information in this file does not completely + describe the use and interpretation of Unicode character properties and + behavior. It must be used in conjunction with the data in the other files in + the UCD, and relies on the notation and definitions supplied in The + Unicode Standard. All chapter references are to Version 3.1.0 of the + standard.

+
+
+

Introduction

+

PropList.txt contains extended properties that supplement the +General Category property described in UnicodeData.html. Unlike the derived +properties, the properties in PropList.txt cannot be derived directly from +UnicodeData.txt or other data files of the UCD. These properties are listed in +the following table.

+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Property ValueN/IDefinition and Usage
White_spaceNSpace characters and those format control characters + (such as TAB, CR and LF) which should be treated by programming + languages as "white space" for the purpose of parsing + elements. +

Note: ZERO WIDTH SPACE and ZERO WIDTH NO-BREAK SPACE are not + included, since their functions are restricted to line-break control. + Their names are unfortunately misleading in this respect.

+

Note: There are other senses of "whitespace" that + encompass a different set of characters.

+
Bidi_ControlNThose format control characters which have specific + functions in the Bidirectional Algorithm.
Join_ControlNThose format control characters which have specific + functions for control of cursive joining and ligation.
DashIThose punctuation characters explicitly called out as + dashes in the Unicode Standard, plus compatibility equivalents to those. + Most of these have the Pd General Category, but some have the Sm General + Category because of their use in mathematics.
HyphenIThose dashes used to mark connections between pieces of + words, plus the Katakana middle dot. The Katakana middle dot functions + like a hyphen, but is shaped like a dot rather than a dash.
Quotation_MarkIThose punctuation characters that function as quotation + marks.
Terminal_PunctuationIThose punctuation characters that generally mark the end + of textual units.
Other_MathIMath characters that do not have the Sm General Category.
Hex_DigitICharacters commonly used for the representation of + hexadecimal numbers, plus their compatibility equivalents.
Other_AlphabeticIAlphabetic characters that do not have L as their major + class for the General Category (Lu, Ll, Lt, Lm, Lo).
IdeographicICharacters considered to be CJKV (Chinese, Japanese, + Korean, and Vietnamese) ideographs.
DiacriticICharacters that linguistically modify the meaning of + another character to which they apply. Some diacritics are not combining + characters, and some combining characters are not diacritics.
ExtenderICharacters whose principal function is to extend the + value or shape of a preceding alphabetic character. Typical of these are + length and iteration marks.
Other_LowercaseILowercase characters that do not have the Ll General + Category.
Other_UppercaseIUppercase characters that do not have the Lu General + Category.
Noncharacter_Code_PointNCode points that are explicitly defined as illegal for + the encoding of characters. See Unicode 3.1 for + more information.
+
+
+


+UCD Terms of Use

+

Disclaimer

+
+

The Unicode Character Database is provided as is by Unicode, Inc. No + claims are made as to fitness for any particular purpose. No warranties of any + kind are expressed or implied. The recipient agrees to determine applicability + of information provided. If this file has been purchased on magnetic or + optical media from Unicode, Inc., the sole remedy for any claim will be + exchange of defective media within 90 days of receipt.

+

This disclaimer is applicable for all other data files accompanying the + Unicode Character Database, some of which have been compiled by the Unicode + Consortium, and some of which have been supplied by other sources.

+
+

Limitations on Rights to Redistribute This Data

+
+

Recipient is granted the right to make copies in any form for internal + distribution and to freely use the information supplied in the creation of + products supporting the UnicodeTM Standard. The files in the + Unicode Character Database can be redistributed to third parties or other + organizations (whether for profit or not) as long as this notice and the + disclaimer notice are retained. Information can be extracted from these files + and used in documentation or programs, as long as there is an accompanying + notice indicating the source.

+
+
+

HomeE-mail + + + + diff --git a/lib/unicode/UCD.html b/lib/unicode/UCD.html index 284349e..3634dc5 100644 --- a/lib/unicode/UCD.html +++ b/lib/unicode/UCD.html @@ -15,12 +15,29 @@ -

UNICODE CHARACTER DATABASE
-Version 3.0.1

+ + + + + + + +
+ + + + +
[Unicode]  Unicode Character + Database
+
 
+

UNICODE CHARACTER DATABASE

- + @@ -28,17 +45,17 @@ Version 3.0.1 - + + href="http://www.unicode.org/Public/3.1-Update/UnicodeCharacterDatabase-3.1.0.html">http://www.unicode.org/Public/3.1-Update/UnicodeCharacterDatabase-3.1.0.html + href="http://www.unicode.org/Public/3.0-Update1/UnicodeCharacterDatabase-3.0.1.html">http://www.unicode.org/Public/3.0-Update1/UnicodeCharacterDatabase-3.0.1.html @@ -46,155 +63,308 @@ Version 3.0.1 href="http://www.unicode.org/Public/UNIDATA/UnicodeCharacterDatabase.html">http://www.unicode.org/Public/UNIDATA/UnicodeCharacterDatabase.html
Revision3.0.13.1.0
Authors
Date2000-08-172001-02-28
This Version http://www.unicode.org/Public/3.0-Update1/UnicodeCharacterDatabase-3.0.1.html
Previous Version http://www.unicode.org/Public/3.0-Update/UnicodeCharacterDatabase-3.0.0.html
Latest Version
-

Copyright © 1995-2000 Unicode, Inc. All Rights reserved.

-

Disclaimer

-

The Unicode Character Database is provided as is by Unicode, Inc. No claims -are made as to fitness for any particular purpose. No warranties of any kind are -expressed or implied. The recipient agrees to determine applicability of -information provided. If this file has been purchased on magnetic or optical -media from Unicode, Inc., the sole remedy for any claim will be exchange of -defective media within 90 days of receipt.

-

This disclaimer is applicable for all other data files accompanying the -Unicode Character Database, some of which have been compiled by the Unicode -Consortium, and some of which have been supplied by other sources.

-

Limitations on Rights to Redistribute This Data

-

Recipient is granted the right to make copies in any form for internal -distribution and to freely use the information supplied in the creation of -products supporting the UnicodeTM Standard. The files in the Unicode -Character Database can be redistributed to third parties or other organizations -(whether for profit or not) as long as this notice and the disclaimer notice are -retained. Information can be extracted from these files and used in -documentation or programs, as long as there is an accompanying notice indicating -the source.

+


+Summary

+
+

This document describes the format and content of the Unicode Character + Database (UCD).

+
+

Status

+
+

The file and the files described herein are part of the Unicode + Character Database and are governed by the UCD Terms of + Use given below.

+

The References provide related information + that is useful in understanding this document.

+

Warning: the information in this file does not completely + describe the use and interpretation of Unicode character properties and + behavior. It must be used in conjunction with the data in the other files in + the Unicode Character Database, and relies on the notation and definitions + supplied in The + Unicode Standard. All chapter references are to Version 3.1.0 of the + standard.

+

Introduction

-

The Unicode Character Database is a set of files that define the Unicode -character properties and internal mappings. For more information about character -properties and mappings, see The Unicode Standard.

-

The Unicode Character Database has been updated to reflect Version 3.0 of the -Unicode Standard, with many characters added to those published in Version 2.0. -A number of corrections have also been made to case mappings or other errors in -the database noted since the publication of Version 2.0. Normative bidirectional -properties have also been modified to reflect decisions of the Unicode Technical -Committee.

-

For more information on versions of the Unicode Standard and how to reference -them, see http://www.unicode.org/unicode/standard/versions/.

-

Conformance

-

Character properties may be either normative or informative. Normative -means that implementations that claim conformance to the Unicode Standard (at a -particular version) and which make use of a particular property or field must -follow the specifications of the standard for that property or field in order to -be conformant. The term normative when applied to a property or field of -the Unicode Character Database, does not mean that the value of that -field will never change. Corrections and extensions to the standard in the -future may require minor changes to normative values, even though the Unicode -Technical Committee strives to minimize such changes. An informative property -or field is strongly recommended, but a conformant implementation is free to use -or change such values as it may require while still being conformant to the -standard. Particular implementations may choose to override the properties and -mappings that are not normative. In that case, it is up to the implementer to -establish a protocol to convey that information.

-

Files

-

The following summarizes the files in the Unicode Character Database.  For -more information about these files, see the referenced technical report(s) or -section of Unicode Standard, Version 3.0.

-

UnicodeData.txt (Chapter 4, UTR #21: Case Mappings, UAX #15 Unicode Normalization -Forms) -

    -
  • The main file in the Unicode Character Database.
  • -
  • For detailed information on the format, see UnicodeData.html. - This file also characterizes which properties are normative and which are - informative.
  • -
-

PropList.txt (Chapter 4) -

    -
  • Additional informative properties list: Alphabetic, Ideographic, - and Mathematical, among others.
  • -
-

SpecialCasing.txt (Chapter 4, UTR #21: Case Mappings) -

    -
  • List of informative special casing properties, including one-to-many - mappings such as SHARP S => "SS", and locale-specific mappings, - such as for Turkish dotless i.
  • -
-

Blocks.txt (Chapter 14) -

    -
  • List of normative block names.
  • -
-

Jamo.txt (Chapter 4) -

    -
  • List of normative Jamo short names, used in deriving HANGUL SYLLABLE names - algorithmically.
  • -
-

ArabicShaping.txt (Section 8.2) -

    -
  • Basic Arabic and Syriac character shaping properties, such as initial, - medial and final shapes. These properties are normative for minimal shaping - of Arabic and Syriac.
  • -
-

NamesList.txt (Chapter 14) -

    -
  • This file duplicates some of the material in the UnicodeData file, and - adds informative annotations uses in the character charts, as printed in the - Unicode Standard.
  • -
  • Note: The information in NamesList.txt and Index.txt files matches - the appropriate version of the book. Changes in the Unicode Character - Database since then may not be reflected in these files, since they are - primarily of archival interest.
  • -
-

Index.txt (Chapter 14) -

    -
  • Informative index to Unicode characters, as printed in the Unicode - Standard
  • -
  • Note: The information in NamesList.txt and Index.txt files matches - the appropriate version of the book. Changes in the Unicode Character - Database since then may not be reflected in these files, since they are - primarily of archival interest.
  • -
-

CompositionExclusions.txt (UAX #15 Unicode Normalization -Forms) -

    -
  • Normative properties for normalization.
  • -
-

LineBreak.txt (UAX -#14: Line Breaking Properties) +

The Unicode Character Database (UCD) is a set of files that define the +Unicode character properties and internal mappings. This document describes the +files that are part of The +Unicode Standard, Version 3.1 [U3.1]. The main changes +in this version are:

    -
  • Normative and informative properties for line breaking. To see which - properties are informative and which are normative, consult UAX #14.
  • +
  • All of the data files have been updated to account for the large number of + additional characters in Unicode 3.1.
  • +
  • PropList.txt has been extensively reorganized and reformatted.
  • +
  • Scripts.txt has been added to the UCD.
  • +
  • A large number of informative derived property files have been added to + the UCD.
-

EastAsianWidth.txt (UAX -#11: East Asian Character Width) +

Files in the UCD use a common format unless otherwise specified. For +details, see UCD File Format.

+

Conformance

+

For information on the meaning and application of the terms normative and +informative, see "Chapter 4, Character Properties (revision)" in UAX #27, Unicode +3.1.

+

Some informative data files contain derived properties, properties that can +be derived from other properties in the UCD. The derived properties that are +computed from solely normative properties are themselves normative, while the +others are informative.

+

UCD Files

+

The following table summarizes the files in the Unicode Character Database. + For more information about these files, see the referenced technical +report(s), files, or section of Unicode Standard, Version 3.1.

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
".txt" FileDescriptionN/ISummary
ArabicShapingSection 8.2NBasic Arabic and Syriac character shaping properties, such as initial, + medial and final shapes.
BidiMirroringUAX #9IProperties for substituting characters in an implementation of + bidirectional mirroring.
BlocksChapter 14NList of block names.
CaseFoldingUTR #21NMapping from characters to their case-folded forms. This is an + informative file containing normative derived properties. +

Derived from UnicodeData and SpecialCasing.

+
CompositionExclusionsUAX #15NProperties for normalization.
DerivedXXXDerivedProperties.htmlN/IVarious informative derived files, described in the documentation file. + Some of the derived properties are normative and some are informative.
EastAsianWidthUAX #11IProperties for determining the choice of wide vs. narrow glyphs in East + Asian contexts.
IndexChapter 14IIndex to Unicode characters, as printed in the Unicode Standard. (See Update Note.)
JamoChapter 4NList of Jamo short names, used in deriving HANGUL SYLLABLE names + algorithmically.
LineBreakUAX #14N/IProperties for line breaking.
NamesListChapter 14IThis file duplicates some of the material in the UnicodeData file, and + adds annotations used in the character charts.
NormalizationTestUAX #15NTest file for conformance to Unicode Normalization Forms.
PropListPropList.htmlN/IExtended character properties
ScriptsUTR #24IDefault script values for use in regular expressions.
SpecialCasingChapter 4,
+ UTR #21
NList of properties required for full case mapping.
UnicodeDataUnicodeData.html,
+ Chapter 4,
+ UTR #21,
+ UAX #15
N/IThe main file in the UCD. 
UnihanUnihan.txtN/IExtended properties of Han (CJK) characters. (See Format + Note.)
+
+

Update Note: The information in Index.txt + files matches the appropriate version of the book. Changes in the Unicode + Character Database since then may not be reflected in these files, since they + are primarily of archival interest.

+

Format Note: The file data format differs + from the standard format, and is described in the header of the file. The + header also describes which properties are informative and which are + normative.

+
+

UCD File Format

+

Files in the UCD use the following format, unless otherwise specified.

    -
  • Informative properties for determining the choice of wide vs. narrow - glyphs in East Asian contexts.
  • +
  • Each line of data consists of fields separated by semicolons. The fields + are numbered starting with zero. Code points are expressed as hexadecimal + numbers with four to six digits. They are written without "U+". + Within a sequence of code points, spaces are used for separation. Leading + and trailing spaces within a field are not significant.
-

BidiMirroring.txt (UAX #9: The -Bidirectional Algorithm)

    -
  • Informative properties for substituting characters in an implementation of - bidirectional mirroring.
  • +
  • The first field (0) of each line in the Unicode Character Database files + represents a code point or range. The remaining fields (1..n) are properties + associated with that code point.
-

CaseFolding.txt (UTR -#21: Case Mappings)

    -
  • Informative file mapping characters to their case-folded form.
  • +
  • A range of code points is specified by the form "X..Y". Each + code point from X to Y has the associated properties. For example:
-

NormalizationTest.txt (UAX #15 Unicode Normalization -Forms)

+
+
0000..007F; Basic Latin
+0080..00FF; Latin-1 Supplement
+
+1680      ; White_space # Zs OGHAM SPACE MARK
+2000..200A; White_space # Zs [11] EN QUAD..HAIR SPACE
+
    -
  • Normative test file for conformance to Unicode Normalization Forms.
  • +
  • Hash marks ("#") are used to indicate comments: all characters + from the hash mark to the end of the line are comments, and disregarded when + parsing data. In many files, the comments on data lines use a common format.
-

diffXvY.txt +

+
00BC..00BE ; numeric # No [3] VULGAR FRACTION ONE QUARTER..VULGAR FRACTION THREE QUARTERS
+
    -
  • Mechanically-generated informative files containing accumulated - differences between successive versions of UnicodeData.txt
  • +
  • The first part of the comment is the UCD general category. The symbol + "L&" indicates characters of type Lu, Ll, or Lt. The code + point ranges are calculated so that they all have the same General Category + (or L&). While this results in more ranges than are strictly necessary, + it makes the contents of the ranges clearer. The second part of the comment + (in square brackets), indicates the number of items in a range, if there is + one. The third part is the name of the character in field zero: if it is a + range, then the character names for the ends of the range are separated by + "..".
+

However, the comments are purely informational, and may change format or be +omitted in the future. They should not be parsed for content.

+

References

+ + + + + + + + + + + + + + + + + + + + + + + +
[FAQ]Unicode Frequently Asked Questions
+ http://www.unicode.org/unicode/faq/
+
For answers to common questions on technical issues.
[Glossary]Unicode Glossary
+ http://www.unicode.org/glossary/
+
For explanations of terminology used in this and other documents.
[Reports]Unicode Technical Reports
+ http://www.unicode.org/unicode/reports/
+
For information on the status and development process for + technical reports, and for a list of technical reports.
[U3.1]Unicode Standard Annex #27: Unicode 3.1
+ http://www.unicode.org/unicode/reports/tr27/
[Versions]Versions of the Unicode Standard
+ http://www.unicode.org/unicode/standard/versions/
+
For details on the precise contents of each version of the + Unicode Standard, and how to cite them.
+


+UCD Terms of Use

+

Disclaimer

+
+

The Unicode Character Database is provided as is by Unicode, Inc. No + claims are made as to fitness for any particular purpose. No warranties of any + kind are expressed or implied. The recipient agrees to determine applicability + of information provided. If this file has been purchased on magnetic or + optical media from Unicode, Inc., the sole remedy for any claim will be + exchange of defective media within 90 days of receipt.

+

This disclaimer is applicable for all other data files accompanying the + Unicode Character Database, some of which have been compiled by the Unicode + Consortium, and some of which have been supplied by other sources.

+
+

Limitations on Rights to Redistribute This Data

+
+

Recipient is granted the right to make copies in any form for internal + distribution and to freely use the information supplied in the creation of + products supporting the Unicode™ Standard. The files in the + Unicode Character Database can be redistributed to third parties or other + organizations (whether for profit or not) as long as this notice and the + disclaimer notice are retained. Information can be extracted from these files + and used in documentation or programs, as long as there is an accompanying + notice indicating the source.

+
+
+

HomeE-mail diff --git a/lib/unicode/Unicode.html b/lib/unicode/Unicode.html index a8fbe32..7e6eb48 100644 --- a/lib/unicode/Unicode.html +++ b/lib/unicode/Unicode.html @@ -10,12 +10,29 @@ -

UnicodeData File Format
-Version 3.0.1

+ + + + + + + +
+ + + + +
[Unicode]  Unicode Character + Database
+
 
+

Unicode Data File Format

- + @@ -23,17 +40,17 @@ Version 3.0.1 - + + href="http://www.unicode.org/Public/3.1-Update/UnicodeData-3.1.0.html">http://www.unicode.org/Public/3.1-Update/UnicodeData-3.1.0.html + href="http://www.unicode.org/Public/3.0-Update1/UnicodeData-3.0.1.html">http://www.unicode.org/Public/3.0-Update1/UnicodeData-3.0.1.html @@ -41,9 +58,29 @@ Version 3.0.1 href="http://www.unicode.org/Public/UNIDATA/UnicodeData.html">http://www.unicode.org/Public/UNIDATA/UnicodeData.html
Revision3.0.13.1.0
Authors
Date2000-08-172001-02-28
This Version http://www.unicode.org/Public/3.0-Update1/UnicodeData-3.0.1.html
Previous Version http://www.unicode.org/Public/3.0-Update/UnicodeData-3.0.0.html
Latest Version
-

Copyright © 1995-2000 Unicode, Inc. All Rights reserved.
-For more information, including Disclaimer and Limitations, see UnicodeCharacterDatabase-3.0.1.html

+


+Summary

+
+

This document describes the format and content of the UnicodeData.txt + file in the Unicode Character Database (UCD).

+
+

Status

+
+

The file and the files described herein are part of the Unicode + Character Database  and governed by the UCD Terms of + Use given below.

+

For general information on file formats and table formats, and the + implications of normative vs informative properties, see + UnicodeCharacterDatabase.html.

+

Warning: the information in this file does not completely + describe the use and interpretation of Unicode character properties and + behavior. It must be used in conjunction with the data in the other files in + the UCD, and relies on the notation and definitions supplied in The + Unicode Standard. All chapter references are to Version 3.1.0 of the + standard.

+
+

Introduction

This document describes the format of the UnicodeData.txt file, which is one of the files in the Unicode Character Database. The document is divided into the following sections: @@ -62,80 +99,68 @@ following sections:

  • Property Invariants
  • Modification History
  • -

    Warning: the information in this file does not completely describe the -use and interpretation of Unicode character properties and behavior. It must be -used in conjunction with the data in the other files in the Unicode Character Database, and -relies on the notation and definitions supplied in The -Unicode Standard. All chapter references are to Version 3.0 of the -standard.

    Field Formats

    -

    The file consists of lines containing fields separated by semicolons. Each -line represents the data for one encoded character in the Unicode Standard. -Every encoded character has a data entry, with the exception of certain special -ranges, as detailed below. +

    Each line represents the data for one encoded character in the Unicode +Standard. (For information on the file format, see UCD File Format in +UnicodeCharacterDatabase.html). +

    Every encoded character has a data entry, with the exception of certain +special ranges, as detailed below.

      -
    • There are nine special ranges of characters that are represented only by - their start and end characters, since the properties in the file are - uniform, except for code values (which are all sequential and assigned).
    • +
    • These ranges are represented only by their start and end characters, since the + properties in the file are uniform, except for code values (which are all + sequential and assigned).
    • The names of CJK ideograph characters and the names and decompositions of Hangul syllable characters are algorithmically derivable. (See the Unicode Standard and Unicode Standard Annex #15 for more information).
    • Surrogate code values and private use characters have no names.
    • -
    • The Private Use character outside of the BMP (U+F0000..U+FFFFD, - U+100000..U+10FFFD) are listed as distinct ranges. These correspond to surrogate pairs +
    • The supplementary Private Use characters (U+F0000 .. U+FFFFD, U+100000 .. + U+10FFFD) are listed as distinct ranges. These correspond to surrogate pairs where the first surrogate is in the High Surrogate Private Use section.

    The exact ranges represented by start and end characters are:

      -
    • CJK Ideographs Extension A (U+3400 - U+4DB5)
    • -
    • CJK Ideographs (U+4E00 - U+9FA5)
    • -
    • Hangul Syllables (U+AC00 - U+D7A3)
    • -
    • Non-Private Use High Surrogates (U+D800 - U+DB7F)
    • -
    • Private Use High Surrogates (U+DB80 - U+DBFF)
    • -
    • Low Surrogates (U+DC00 - U+DFFF)
    • -
    • The Private Use Area (U+E000 - U+F8FF)
    • -
    • Plane 15 Private Use Area (U+F0000 - U+FFFFD)
    • -
    • Plane 16 Private Use Area (U+100000 - U+10FFFD)
    • +
    • CJK Ideographs Extension A (U+3400 .. U+4DB5)
    • +
    • CJK Ideographs (U+4E00 .. U+9FA5)
    • +
    • Hangul Syllables (U+AC00 .. U+D7A3)
    • +
    • Non-Private Use High Surrogates (U+D800 .. U+DB7F)
    • +
    • Private Use High Surrogates (U+DB80 .. U+DBFF)
    • +
    • Low Surrogates (U+DC00 .. U+DFFF)
    • +
    • The Private Use Area (U+E000 .. U+F8FF)
    • +
    • CJK Ideographs Extension B (U+20000 .. U+2A6D6)
    • +
    • Plane 15 Private Use Area (U+F0000 .. U+FFFFD)
    • +
    • Plane 16 Private Use Area (U+100000 .. U+10FFFD)

    The following table describes the format and meaning of each field in a data -entry in the UnicodeData file. Fields which contain normative information are so -indicated.

    +entry in the UnicodeData file.

    - + - - + + - + - + @@ -144,7 +169,7 @@ indicated.

    - + @@ -152,7 +177,7 @@ indicated.

    - + - + - + @@ -180,7 +205,7 @@ indicated.

    - + - + - + - + + is only provided when it is significantly different from the current name + for the character. - - + + - + + alphabet with case distinctions, and has a simple upper case equivalent, + then the upper case equivalent is in this field. See the explanation below + on case distinctions. These mappings are always one-to-one, not + one-to-many or many-to-one. +

    For full case mappings, see UTR #21 and + SpecialCasing.txt.

    + - + - +

    Field

    Name

    -

    Status

    +

    N/I

    Explanation

    0 Code valuenormativeCode value. For characters in the range U+0000..U+FFFD - the code value uses a 4-digit hexadecimal format; for characters in the - range U+10000..U+FFFFD the code value uses a 5-digit hexadecimal format; - and for characters in the range U+100000..U+10FFFD the code value uses a - 6-digit hexadecimal format.NCode value.
    1 Character namenormativeN These names match exactly the names published in Chapter 14 of the Unicode Standard, Version 3.0.
    2 General Categorynormative / informative
    - (see below)
    N This is a useful breakdown into various "character types" which can be used as a default categorization in implementations. See below for a brief explanation. 3 Canonical Combining ClassesnormativeN The classes used for the Canonical Ordering Algorithm in the Unicode Standard. These classes are also printed in Chapter 4 of the Unicode Standard.
    4 Bidirectional CategorynormativeN See the list below for an explanation of the abbreviations used in this field. These are the categories required by the Bidirectional Behavior Algorithm in the Unicode Standard. These categories are @@ -162,7 +187,7 @@ indicated.

    5 Character Decomposition MappingnormativeN In the Unicode Standard, not all of the mappings are full (maximal) decompositions. Recursive application of look-up for decompositions will, in all cases, lead to a maximal decomposition. The @@ -172,7 +197,7 @@ indicated.

    6 Decimal digit valuenormativeN This is a numeric field. If the character has the decimal digit property, as specified in Chapter 4 of the Unicode Standard, the value of that digit is represented with an integer value in this field
    7 Digit valuenormativeN This is a numeric field. If the character represents a digit, not necessarily a decimal digit, the value is here. This covers digits which do not form decimal radix forms, such as the compatibility @@ -189,7 +214,7 @@ indicated.

    8 Numeric valuenormativeN This is a numeric field. If the character has the numeric property, as specified in Chapter 4 of the Unicode Standard, the value of that character is represented with an integer or rational number in this @@ -200,7 +225,7 @@ indicated.

    9 MirrorednormativeN If the character has been identified as a "mirrored" character in bidirectional text, this field has the value "Y"; otherwise "N". The list of mirrored @@ -209,51 +234,58 @@ indicated.

    10 Unicode 1.0 NameinformativeI This is the old name as published in Unicode 1.0. This name - is only provided when it is significantly different from the Unicode 3.0 - name for the character.
    11 10646 comment fieldinformativeThis is the ISO 10646 comment field. It appears in parentheses - in the 10646 names list, or contains an asterisk to mark an Annex P note.IThis is the ISO 10646 comment field. It appears in + parentheses in the 10646 names list, or contains an asterisk to mark an + Annex P note.
    12 Uppercase MappinginformativeN Upper case equivalent mapping. If a character is part of an - alphabet with case distinctions, and has an upper case equivalent, then - the upper case equivalent is in this field. See the explanation below on - case distinctions. These mappings are always one-to-one, not one-to-many - or many-to-one. This field is informative.
    13 Lowercase MappinginformativeN Similar to Uppercase mapping
    14 Titlecase MappinginformativeN Similar to Uppercase mapping

    General Category

    -

    The values in this field are abbreviations for the following. Some of the -values are normative, and some are informative. For more information, see the -Unicode Standard.

    -

    Note: the standard does not assign information to control characters -(except for certain cases in the Bidirectional Algorithm). Implementations will -generally also assign categories to certain control characters, notably CR and -LF, according to platform conventions.

    -

    Normative Categories

    - +

    The values in this field are abbreviations for the following values. For more +information, see the Unicode Standard.

    +
    +

    Note: the standard does not assign information to control characters + (except for certain cases in the Bidirectional Algorithm). Implementations + will generally also assign categories to certain control characters, notably + CR and LF, according to platform conventions. See UAX #13: Unicode Newline + Guidelines for more information.

    +
    +
    @@ -273,6 +305,14 @@ LF, according to platform conventions.

    + + + + + + + + @@ -297,55 +337,6 @@ LF, according to platform conventions.

    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

    Abbr.

    Letter, Titlecase
    LmLetter, Modifier
    LoLetter, Other
    Mn Mark, Non-Spacing
    Number, Other
    ZsSeparator, Space
    ZlSeparator, Line
    ZpSeparator, Paragraph
    CcOther, Control
    CfOther, Format
    CsOther, Surrogate
    CoOther, Private Use
    CnOther, Not Assigned (no characters in the file have this property)
    -

    Informative Categories

    - - - - - - - - - - - - - - @@ -389,14 +380,50 @@ LF, according to platform conventions.

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    -

    Abbr.

    -

    Description

    LmLetter, Modifier
    LoLetter, Other
    Pc Punctuation, Connector
    So Symbol, Other
    ZsSeparator, Space
    ZlSeparator, Line
    ZpSeparator, Paragraph
    CcOther, Control
    CfOther, Format
    CsOther, Surrogate
    CoOther, Private Use
    CnOther, Not Assigned (no characters in the file have this property)
    +
    +

    Note: The term "L&" is sometimes used to stand for + Uppercase, Lowercase or Titlecase letters (Lu, Ll, or Lt).

    +

    Bidirectional Category

    Please refer to Chapter 3 for an explanation of the algorithm for Bidirectional Behavior and an explanation of the significance of these categories. An up-to-date version can be found on Unicode Standard Annex #9: -The Bidirectional Algorithm. These values are normative.

    - +The Bidirectional Algorithm.

    +
    @@ -481,12 +508,11 @@ The Bidirectional Algorithm. These values are normative.

    Type

    Character Decomposition Mapping

    -

    The decomposition is a normative property of a character. The tags supplied -with certain decomposition mappings generally indicate formatting information. -Where no such tag is given, the mapping is designated as canonical. Conversely, -the presence of a formatting tag also indicates that the mapping is a -compatibility mapping and not a canonical mapping. In the absence of other -formatting information in a compatibility mapping, the tag is used to +

    The tags supplied with certain decomposition mappings generally indicate +formatting information. Where no such tag is given, the mapping is designated as +canonical. Conversely, the presence of a formatting tag also indicates that the +mapping is a compatibility mapping and not a canonical mapping. In the absence +of other formatting information in a compatibility mapping, the tag is used to distinguish it from canonical mappings.

    In some instances a canonical mapping or a compatibility mapping may consist of a single character. For a canonical mapping, this indicates that the @@ -494,7 +520,7 @@ character is a canonical equivalent of another single character. For a compatibility mapping, this indicates that the character is a compatibility equivalent of another single character. The compatibility formatting tags used are:

    - +
    Tag @@ -577,7 +603,7 @@ in Chapter 3 to use those mappings recursively. reordering algorithm.

    Canonical Combining Classes

    - +
    @@ -695,52 +721,31 @@ currently have members but are specified here for completeness.

    Normalization

    Decomposition is specified in Chapter 3. Unicode Standard Annex -#15: Unicode Normalization Forms specifies the interaction between decomposition -and normalization. The most up-to-date version is found on http://www.unicode.org/unicode/reports/tr15/. -That report specifies how the decompositions defined in UnicodeData.txt are used -to derive normalized forms of Unicode text.

    +#15: Unicode Normalization Forms specifies the interaction between +decomposition and normalization. That report specifies how the decompositions +defined in UnicodeData.txt are used to derive normalized forms of Unicode text.

    Note that as of the 2.1.9 update of the Unicode Character Database, the decompositions in the UnicodeData.txt file can be used to recursively derive the full decomposition in canonical order, without the need to separately apply canonical reordering. However, canonical reordering of combining character -sequences must still be applied in decomposition when normalizing source text -which contains any combining marks.

    +sequences must still be applied in decomposition when normalizing +source text which contains any combining marks.

    Case Mappings

    -

    The case mapping is an informative, default mapping. Case itself, on the -other hand, has normative status. Thus, for example, 0041 LATIN CAPITAL LETTER A -is normatively uppercase, but its lowercase mapping the 0061 LATIN SMALL LETTER -A is informative. The reason for this is that case can be considered to be an -inherent property of a particular character (and is usually, but not always, -derivable from the presence of the terms "CAPITAL" or -"SMALL" in the character name), but case mappings between characters -are occasionally influenced by local conventions. For example, certain -languages, such as Turkish, German, French, or Greek may have small deviations -from the default mappings listed in UnicodeData.

    -

    In addition to uppercase and lowercase, because of the inclusion of certain -composite characters for compatibility, such as 01F1 LATIN CAPITAL LETTER DZ, -there is a third case, called titlecase, which is used where the first -letter of a word is to be capitalized (e.g. UPPERCASE, Titlecase, lowercase). An -example of such a titlecase letter is 01F2 LATIN CAPITAL LETTER D WITH SMALL -LETTER Z.

    -

    The uppercase, titlecase and lowercase fields are only included for -characters that have a single corresponding character of that type. Composite -characters (such as "339D SQUARE CM") that do not have a single -corresponding character of that type can be cased by decomposition.

    -

    For compatibility with existing parsers, UnicodeData only contains case +

    There are a number of complications to case mappings that occur once the +repertoire of characters is expanded beyond ASCII. For more information, see UTR #21: Case Mappings.

    +

    For compatibility with existing parsers, UnicodeData.txt only contains case mappings for characters where they are one-to-one mappings; it also omits information about context-sensitive case mappings. Information about these -special cases can be found in a separate data file, SpecialCasing.txt, which -has been added starting with the 2.1.8 update to the Unicode data files. -SpecialCasing.txt contains additional informative case mappings that are either -not one-to-one or which are context-sensitive.

    +special cases can be found in a separate data file, SpecialCasing.txt.

    Property Invariants

    Values in UnicodeData.txt are subject to correction as errors are found; however, some characteristics of the categories themselves can be considered invariants. Applications may wish to take these invariants into account when -choosing how to implement character properties. The following is a partial list -of known invariants for the Unicode Character Database.

    +choosing how to implement character properties. For more information, see Unicode Policies.

    +

    The following is a partial list of known invariants for the Unicode Character +Database.

    Database Fields

    • The number of fields in UnicodeData.txt is fixed.
    • @@ -792,17 +797,6 @@ of known invariants for the Unicode Character Database.

    -

    Case

    -
      -
    • Characters of type Lu, Lt, or Ll are called cased. All characters - with an Upper, Lower, or Titlecase mapping are cased characters. -
        -
      • However, characters with the General Categories of Lu, Ll, or Lt may - not always have case mappings, and case mappings may vary by locale. - (See http://www.unicode.org/Public/UNIDATA/SpecialCasing.txt).
      • -
      -
    • -

    Canonical Decomposition

    • Canonical mappings are always in canonical order.
    • @@ -832,6 +826,16 @@ of known invariants for the Unicode Character Database.

      This section provides a summary of the changes between update versions of the Unicode Standard.

      Unicode +3.1

      +

      Modifications made for Version 3.1.0 of UnicodeData.txt include: +

        +
      • Addition of 2237 new entries, to cover new characters and new ranges of + unified Han characters encoded in Unicode 3.1.
      • +
      • Changed General Category value of 16EE..16F0 (Runic golden numbers) from + No to Nl.
      • +
      +

      Unicode 3.0.1

      Modifications made for Version 3.0.1 of UnicodeData.txt include: @@ -1019,6 +1023,45 @@ Version 2.0 include:

    • Added categories Me, Sk, Pc, Nl, Cs, Cf, and rectified a number of mistakes in the database.
    +

    UCD Terms of Use

    +

    Disclaimer

    +
    +

    The Unicode Character Database is provided as is by Unicode, Inc. No + claims are made as to fitness for any particular purpose. No warranties of any + kind are expressed or implied. The recipient agrees to determine applicability + of information provided. If this file has been purchased on magnetic or + optical media from Unicode, Inc., the sole remedy for any claim will be + exchange of defective media within 90 days of receipt.

    +

    This disclaimer is applicable for all other data files accompanying the + Unicode Character Database, some of which have been compiled by the Unicode + Consortium, and some of which have been supplied by other sources.

    +
    +

    Limitations on Rights to Redistribute This Data

    +
    +

    Recipient is granted the right to make copies in any form for internal + distribution and to freely use the information supplied in the creation of + products supporting the Unicode™ Standard. The files in the + Unicode Character Database can be redistributed to third parties or other + organizations (whether for profit or not) as long as this notice and the + disclaimer notice are retained. Information can be extracted from these files + and used in documentation or programs, as long as there is an accompanying + notice indicating the source.

    +
    +
    +
    +
    +

    Value

    + + + +
    HomeTerms of UseE-mail
    + + diff --git a/lib/unicode/rename b/lib/unicode/rename index 3fcfbcc..71c1857 100644 --- a/lib/unicode/rename +++ b/lib/unicode/rename @@ -1,18 +1,18 @@ -ArabicShaping.txt ArabShap.txt -BidiMirroring.txt BidiMirr.txt -Blocks.txt Blocks.txt -CaseFolding.txt CaseFold.txt -CompositionExclusions.txt CompExcl.txt -EastAsianWidth.txt EAWidth.txt -Index.txt Index.txt -Jamo.txt Jamo.txt -LineBreak.txt LineBrk.txt -NamesList.html NamesList.html -NamesList.txt Names.txt -PropList.txt PropList.txt -ReadMe.txt ReadMe.txt -SpecialCasing.txt SpecCase.txt -UnicodeCharacterDatabase.html UCD.html -UnicodeData.html Unicode.html -UnicodeData.txt Unicode.txt - +ArabicShaping.txt ArabShap.txt +BidiMirroring.txt BidiMirr.txt +#Blocks.txt Blocks.txt +CaseFolding.txt CaseFold.txt +CompositionExclusions.txt CompExcl.txt +EastAsianWidth.txt EAWidth.txt +#Index.txt Index.txt +#Jamo.txt Jamo.txt +LineBreak.txt LineBrk.txt +#NamesList.html NamesList.html +#NamesList.txt NamesList.txt +#PropList.txt PropList.txt +#PropList.html PropList.html +#ReadMe.txt ReadMe.txt +SpecialCasing.txt SpecCase.txt +UnicodeCharacterDatabase.html UCD.html +UnicodeData.html Unicode.html +UnicodeData.txt Unicode.txt diff --git a/lib/unicode/version b/lib/unicode/version index 4e60e6c..8c50098 100644 --- a/lib/unicode/version +++ b/lib/unicode/version @@ -1,2 +1 @@ -3.1 beta 2001-03-23 - +3.1