From: Karl Williamson Date: Sun, 6 Dec 2009 05:21:38 +0000 (-0700) Subject: qr/\X/ expansion X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=37e2e78edfe0a224b8a615820f46db879584f523;p=p5sagit%2Fp5-mst-13.2.git qr/\X/ expansion --- diff --git a/MANIFEST b/MANIFEST index e36bd4b..fc94362 100644 --- a/MANIFEST +++ b/MANIFEST @@ -3593,6 +3593,7 @@ lib/Unicode/README Explanation what happened to lib/unicode. lib/Unicode/UCD.pm Unicode character database lib/Unicode/UCD.t See if Unicode character database works lib/unicore/ArabicShaping.txt Unicode character database +lib/unicore/auxiliary/GCBTest.txt Unicode character database lib/unicore/auxiliary/GraphemeBreakProperty.txt Unicode character database lib/unicore/auxiliary/SentenceBreakProperty.txt Unicode character database lib/unicore/auxiliary/WordBreakProperty.txt Unicode character database diff --git a/embed.fnc b/embed.fnc index 575c4df..6b24d1e 100644 --- a/embed.fnc +++ b/embed.fnc @@ -502,6 +502,16 @@ ApR |bool |is_utf8_print |NN const U8 *p ApR |bool |is_utf8_punct |NN const U8 *p ApR |bool |is_utf8_xdigit |NN const U8 *p ApR |bool |is_utf8_mark |NN const U8 *p +pR |bool |is_utf8_X_begin |NN const U8 *p +pR |bool |is_utf8_X_extend |NN const U8 *p +pR |bool |is_utf8_X_prepend |NN const U8 *p +pR |bool |is_utf8_X_non_hangul |NN const U8 *p +pR |bool |is_utf8_X_L |NN const U8 *p +pR |bool |is_utf8_X_LV |NN const U8 *p +pR |bool |is_utf8_X_LVT |NN const U8 *p +pR |bool |is_utf8_X_LV_LVT_V |NN const U8 *p +pR |bool |is_utf8_X_T |NN const U8 *p +pR |bool |is_utf8_X_V |NN const U8 *p : Used in perly.y p |OP* |jmaybe |NN OP *o : Used in pp.c diff --git a/embed.h b/embed.h index 2af070d..e30e6f2 100644 --- a/embed.h +++ b/embed.h @@ -392,6 +392,16 @@ #define is_utf8_xdigit Perl_is_utf8_xdigit #define is_utf8_mark Perl_is_utf8_mark #ifdef PERL_CORE +#define is_utf8_X_begin Perl_is_utf8_X_begin +#define is_utf8_X_extend Perl_is_utf8_X_extend +#define is_utf8_X_prepend Perl_is_utf8_X_prepend 
+#define is_utf8_X_non_hangul Perl_is_utf8_X_non_hangul +#define is_utf8_X_L Perl_is_utf8_X_L +#define is_utf8_X_LV Perl_is_utf8_X_LV +#define is_utf8_X_LVT Perl_is_utf8_X_LVT +#define is_utf8_X_LV_LVT_V Perl_is_utf8_X_LV_LVT_V +#define is_utf8_X_T Perl_is_utf8_X_T +#define is_utf8_X_V Perl_is_utf8_X_V #define jmaybe Perl_jmaybe #define keyword Perl_keyword #endif @@ -2785,6 +2795,16 @@ #define is_utf8_xdigit(a) Perl_is_utf8_xdigit(aTHX_ a) #define is_utf8_mark(a) Perl_is_utf8_mark(aTHX_ a) #ifdef PERL_CORE +#define is_utf8_X_begin(a) Perl_is_utf8_X_begin(aTHX_ a) +#define is_utf8_X_extend(a) Perl_is_utf8_X_extend(aTHX_ a) +#define is_utf8_X_prepend(a) Perl_is_utf8_X_prepend(aTHX_ a) +#define is_utf8_X_non_hangul(a) Perl_is_utf8_X_non_hangul(aTHX_ a) +#define is_utf8_X_L(a) Perl_is_utf8_X_L(aTHX_ a) +#define is_utf8_X_LV(a) Perl_is_utf8_X_LV(aTHX_ a) +#define is_utf8_X_LVT(a) Perl_is_utf8_X_LVT(aTHX_ a) +#define is_utf8_X_LV_LVT_V(a) Perl_is_utf8_X_LV_LVT_V(aTHX_ a) +#define is_utf8_X_T(a) Perl_is_utf8_X_T(aTHX_ a) +#define is_utf8_X_V(a) Perl_is_utf8_X_V(aTHX_ a) #define jmaybe(a) Perl_jmaybe(aTHX_ a) #define keyword(a,b,c) Perl_keyword(aTHX_ a,b,c) #endif diff --git a/embedvar.h b/embedvar.h index e805a79..63ed46e 100644 --- a/embedvar.h +++ b/embedvar.h @@ -324,6 +324,16 @@ #define PL_unitcheckav_save (vTHX->Iunitcheckav_save) #define PL_unlockhook (vTHX->Iunlockhook) #define PL_unsafe (vTHX->Iunsafe) +#define PL_utf8_X_L (vTHX->Iutf8_X_L) +#define PL_utf8_X_LV (vTHX->Iutf8_X_LV) +#define PL_utf8_X_LVT (vTHX->Iutf8_X_LVT) +#define PL_utf8_X_LV_LVT_V (vTHX->Iutf8_X_LV_LVT_V) +#define PL_utf8_X_T (vTHX->Iutf8_X_T) +#define PL_utf8_X_V (vTHX->Iutf8_X_V) +#define PL_utf8_X_begin (vTHX->Iutf8_X_begin) +#define PL_utf8_X_extend (vTHX->Iutf8_X_extend) +#define PL_utf8_X_non_hangul (vTHX->Iutf8_X_non_hangul) +#define PL_utf8_X_prepend (vTHX->Iutf8_X_prepend) #define PL_utf8_alnum (vTHX->Iutf8_alnum) #define PL_utf8_alpha (vTHX->Iutf8_alpha) #define PL_utf8_ascii 
(vTHX->Iutf8_ascii) @@ -641,6 +651,16 @@ #define PL_Iunitcheckav_save PL_unitcheckav_save #define PL_Iunlockhook PL_unlockhook #define PL_Iunsafe PL_unsafe +#define PL_Iutf8_X_L PL_utf8_X_L +#define PL_Iutf8_X_LV PL_utf8_X_LV +#define PL_Iutf8_X_LVT PL_utf8_X_LVT +#define PL_Iutf8_X_LV_LVT_V PL_utf8_X_LV_LVT_V +#define PL_Iutf8_X_T PL_utf8_X_T +#define PL_Iutf8_X_V PL_utf8_X_V +#define PL_Iutf8_X_begin PL_utf8_X_begin +#define PL_Iutf8_X_extend PL_utf8_X_extend +#define PL_Iutf8_X_non_hangul PL_utf8_X_non_hangul +#define PL_Iutf8_X_prepend PL_utf8_X_prepend #define PL_Iutf8_alnum PL_utf8_alnum #define PL_Iutf8_alpha PL_utf8_alpha #define PL_Iutf8_ascii PL_utf8_ascii diff --git a/intrpvar.h b/intrpvar.h index 650eb62..8fe641c 100644 --- a/intrpvar.h +++ b/intrpvar.h @@ -531,6 +531,16 @@ PERLVAR(Iutf8_print, SV *) PERLVAR(Iutf8_punct, SV *) PERLVAR(Iutf8_xdigit, SV *) PERLVAR(Iutf8_mark, SV *) +PERLVAR(Iutf8_X_begin, SV *) +PERLVAR(Iutf8_X_extend, SV *) +PERLVAR(Iutf8_X_prepend, SV *) +PERLVAR(Iutf8_X_non_hangul, SV *) +PERLVAR(Iutf8_X_L, SV *) +PERLVAR(Iutf8_X_LV, SV *) +PERLVAR(Iutf8_X_LVT, SV *) +PERLVAR(Iutf8_X_T, SV *) +PERLVAR(Iutf8_X_V, SV *) +PERLVAR(Iutf8_X_LV_LVT_V, SV *) PERLVAR(Iutf8_toupper, SV *) PERLVAR(Iutf8_totitle, SV *) PERLVAR(Iutf8_tolower, SV *) diff --git a/lib/unicore/README.perl b/lib/unicore/README.perl index 7515825..59d66a8 100644 --- a/lib/unicore/README.perl +++ b/lib/unicore/README.perl @@ -5,16 +5,17 @@ The *.txt files were copied from with subdirectories 'extracted' and 'auxiliary' The Unihan files were not included due to space considerations. Also NOT -included were any *.html files and *Test.txt files. It is possible to add the -Unihan files, and edit mktables (see instructions near its beginning) to look -at them. +included were any *.html files. It is possible to add the Unihan files, and +edit mktables (see instructions near its beginning) to look at them. 
The file 'version' should exist and be a single line with the Unicode version, like: 5.2.0 To be 8.3 filesystem friendly, the names of some of the input files have been -changed from the values that are in the Unicode DB: +changed from the values that are in the Unicode DB. Not all of the Test files +are currently used, so may not be present, so some of the mv's can fail. The +.html Test files are not touched. mv PropertyValueAliases.txt PropValueAliases.txt mv NamedSequencesProv.txt NamedSqProv.txt @@ -33,6 +34,11 @@ mv extracted/DerivedLineBreak.txt extracted/DLineBreak.txt mv extracted/DerivedNumericType.txt extracted/DNumType.txt mv extracted/DerivedNumericValues.txt extracted/DNumValues.txt +mv auxiliary/GraphemeBreakTest.txt auxiliary/GCBTest.txt +mv auxiliary/LineBreakTest.txt auxiliary/LBTest.txt +mv auxiliary/SentenceBreakTest.txt auxiliary/SBTest.txt +mv auxiliary/WordBreakTest.txt auxiliary/WBTest.txt + If you have the Unihan database (5.2 and above), you should also do the following: @@ -45,9 +51,9 @@ mv Unihan_RadicalStrokeCounts.txt UnihanRadicalStrokeCounts.txt mv Unihan_Readings.txt UnihanReadings.txt mv Unihan_Variants.txt UnihanVariants.txt -If you download everything, the names of files, such as test files, that are -not used by mktables are not changed by the above, and will not work correctly -as-is on 8.3 filesystems. +If you download everything, the names of files that are not used by mktables +are not changed by the above, and will not work correctly as-is on 8.3 +filesystems. mktables is used to generate the tables used by the rest of Perl. It will warn you about any *.txt files in the directory substructure that it doesn't know @@ -58,17 +64,12 @@ its lists to process. You can run to have it try to process these tables generically. -If any files are added, deleted, or their names change, you must run - - mktables -makelist - -to generate a new list of all the files. - FOR PUMPKINS The files are inter-related. 
If you take the latest UnicodeData.txt, for example, but leave the older versions of other files, there can be subtle -problems. +problems. So get everything available from Unicode, and delete those which +aren't needed. When moving to a new version of Unicode, you need to update 'version' by hand @@ -85,27 +86,19 @@ mktables can continue to be used for earlier Unicode versions. When putting out a new Perl release, think about if any of the Deprecated properties should be moved to Suppressed. -The *.pl files are generated from the *.txt files by the mktables script, -more recently done during the Perl build process, but if you want to try -the old manual way: - - cd lib/unicore - p4 edit *.pl */*.pl */*/*.pl - perl ./mktables -P ../../pod -T ../../t/re/uniprops.t -makelist - p4 revert -a - cd ../.. - perl Porting/manicheck - -If any new (or deleted, unlikely but not impossible) *.pl files are indicated: - - cd lib/unicore - p4 add ... - p4 delete ... - cd ../... - p4 edit MANIFEST - ... +The code in regexec.c for the \X match construct is intimately tied to the +regular expression in UAX #29 (http://www.unicode.org/reports/tr29/). You +should see if it has changed, and if so regexec.c should be modified. The +current one is +( CRLF +| Prepend* ( Hangul-syllable | !Control ) + ( Grapheme_Extend | Spacing_Mark)* +| . ) + +mktables has many checks to warn you if there are unexpected or novel things +that it doesn't know how to handle. -And finally: +Finally: p4 submit diff --git a/lib/unicore/auxiliary/GCBTest.txt b/lib/unicore/auxiliary/GCBTest.txt new file mode 100644 index 0000000..7932e4d --- /dev/null +++ b/lib/unicore/auxiliary/GCBTest.txt @@ -0,0 +1,311 @@ +# GraphemeBreakTest-5.2.0.txt +# Date: 2009-09-19, 00:42:12 GMT [MD] +# +# Unicode Character Database +# Copyright (c) 1991-2009 Unicode, Inc. 
+# For terms of use, see http://www.unicode.org/terms_of_use.html +# For documentation, see http://www.unicode.org/reports/tr44/ +# +# Default Grapheme Break Test +# +# Format: +# <string> (# <comment>)? +# <string> contains hex Unicode code points, with +# ÷ wherever there is a break opportunity, and +# × wherever there is not. +# <comment> the format can change, but currently it shows: +# - the sample character name +# - (x) the Grapheme_Break property* for the sample character +# - [x] the rule that determines whether there is a break or not +# +# These samples may be extended or changed in the future. +# +÷ 0020 ÷ 0020 ÷ # ÷ [0.2] SPACE (Other) ÷ [999.0] SPACE (Other) ÷ [0.3] +÷ 0020 × 0308 ÷ 0020 ÷ # ÷ [0.2] SPACE (Other) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] SPACE (Other) ÷ [0.3] +÷ 0020 ÷ 000D ÷ # ÷ [0.2] SPACE (Other) ÷ [5.0] (CR) ÷ [0.3] +÷ 0020 × 0308 ÷ 000D ÷ # ÷ [0.2] SPACE (Other) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] (CR) ÷ [0.3] +÷ 0020 ÷ 000A ÷ # ÷ [0.2] SPACE (Other) ÷ [5.0] (LF) ÷ [0.3] +÷ 0020 × 0308 ÷ 000A ÷ # ÷ [0.2] SPACE (Other) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] (LF) ÷ [0.3] +÷ 0020 ÷ 0001 ÷ # ÷ [0.2] SPACE (Other) ÷ [5.0] (Control) ÷ [0.3] +÷ 0020 × 0308 ÷ 0001 ÷ # ÷ [0.2] SPACE (Other) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] (Control) ÷ [0.3] +÷ 0020 × 0300 ÷ # ÷ [0.2] SPACE (Other) × [9.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3] +÷ 0020 × 0308 × 0300 ÷ # ÷ [0.2] SPACE (Other) × [9.0] COMBINING DIAERESIS (Extend) × [9.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3] +÷ 0020 ÷ 0E40 ÷ # ÷ [0.2] SPACE (Other) ÷ [999.0] THAI CHARACTER SARA E (Prepend) ÷ [0.3] +÷ 0020 × 0308 ÷ 0E40 ÷ # ÷ [0.2] SPACE (Other) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] THAI CHARACTER SARA E (Prepend) ÷ [0.3] +÷ 0020 × 0903 ÷ # ÷ [0.2] SPACE (Other) × [9.1] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3] +÷ 0020 × 0308 × 0903 ÷ # ÷ [0.2] SPACE (Other) × [9.0] COMBINING DIAERESIS (Extend) × [9.1] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3] +÷ 0020 ÷ 1100 ÷ # ÷ [0.2] SPACE 
(Other) ÷ [999.0] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3] +÷ 0020 × 0308 ÷ 1100 ÷ # ÷ [0.2] SPACE (Other) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3] +÷ 0020 ÷ 1160 ÷ # ÷ [0.2] SPACE (Other) ÷ [999.0] HANGUL JUNGSEONG FILLER (V) ÷ [0.3] +÷ 0020 × 0308 ÷ 1160 ÷ # ÷ [0.2] SPACE (Other) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JUNGSEONG FILLER (V) ÷ [0.3] +÷ 0020 ÷ 11A8 ÷ # ÷ [0.2] SPACE (Other) ÷ [999.0] HANGUL JONGSEONG KIYEOK (T) ÷ [0.3] +÷ 0020 × 0308 ÷ 11A8 ÷ # ÷ [0.2] SPACE (Other) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JONGSEONG KIYEOK (T) ÷ [0.3] +÷ 0020 ÷ AC00 ÷ # ÷ [0.2] SPACE (Other) ÷ [999.0] HANGUL SYLLABLE GA (LV) ÷ [0.3] +÷ 0020 × 0308 ÷ AC00 ÷ # ÷ [0.2] SPACE (Other) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GA (LV) ÷ [0.3] +÷ 0020 ÷ AC01 ÷ # ÷ [0.2] SPACE (Other) ÷ [999.0] HANGUL SYLLABLE GAG (LVT) ÷ [0.3] +÷ 0020 × 0308 ÷ AC01 ÷ # ÷ [0.2] SPACE (Other) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GAG (LVT) ÷ [0.3] +÷ 000D ÷ 0020 ÷ # ÷ [0.2] (CR) ÷ [4.0] SPACE (Other) ÷ [0.3] +÷ 000D ÷ 0308 ÷ 0020 ÷ # ÷ [0.2] (CR) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [999.0] SPACE (Other) ÷ [0.3] +÷ 000D ÷ 000D ÷ # ÷ [0.2] (CR) ÷ [4.0] (CR) ÷ [0.3] +÷ 000D ÷ 0308 ÷ 000D ÷ # ÷ [0.2] (CR) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [5.0] (CR) ÷ [0.3] +÷ 000D × 000A ÷ # ÷ [0.2] (CR) × [3.0] (LF) ÷ [0.3] +÷ 000D ÷ 0308 ÷ 000A ÷ # ÷ [0.2] (CR) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [5.0] (LF) ÷ [0.3] +÷ 000D ÷ 0001 ÷ # ÷ [0.2] (CR) ÷ [4.0] (Control) ÷ [0.3] +÷ 000D ÷ 0308 ÷ 0001 ÷ # ÷ [0.2] (CR) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [5.0] (Control) ÷ [0.3] +÷ 000D ÷ 0300 ÷ # ÷ [0.2] (CR) ÷ [4.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3] +÷ 000D ÷ 0308 × 0300 ÷ # ÷ [0.2] (CR) ÷ [4.0] COMBINING DIAERESIS (Extend) × [9.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3] +÷ 000D ÷ 0E40 ÷ # ÷ [0.2] (CR) ÷ [4.0] THAI CHARACTER SARA E (Prepend) ÷ [0.3] +÷ 000D ÷ 0308 ÷ 0E40 ÷ # ÷ [0.2] (CR) ÷ 
[4.0] COMBINING DIAERESIS (Extend) ÷ [999.0] THAI CHARACTER SARA E (Prepend) ÷ [0.3] +÷ 000D ÷ 0903 ÷ # ÷ [0.2] (CR) ÷ [4.0] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3] +÷ 000D ÷ 0308 × 0903 ÷ # ÷ [0.2] (CR) ÷ [4.0] COMBINING DIAERESIS (Extend) × [9.1] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3] +÷ 000D ÷ 1100 ÷ # ÷ [0.2] (CR) ÷ [4.0] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3] +÷ 000D ÷ 0308 ÷ 1100 ÷ # ÷ [0.2] (CR) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3] +÷ 000D ÷ 1160 ÷ # ÷ [0.2] (CR) ÷ [4.0] HANGUL JUNGSEONG FILLER (V) ÷ [0.3] +÷ 000D ÷ 0308 ÷ 1160 ÷ # ÷ [0.2] (CR) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JUNGSEONG FILLER (V) ÷ [0.3] +÷ 000D ÷ 11A8 ÷ # ÷ [0.2] (CR) ÷ [4.0] HANGUL JONGSEONG KIYEOK (T) ÷ [0.3] +÷ 000D ÷ 0308 ÷ 11A8 ÷ # ÷ [0.2] (CR) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JONGSEONG KIYEOK (T) ÷ [0.3] +÷ 000D ÷ AC00 ÷ # ÷ [0.2] (CR) ÷ [4.0] HANGUL SYLLABLE GA (LV) ÷ [0.3] +÷ 000D ÷ 0308 ÷ AC00 ÷ # ÷ [0.2] (CR) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GA (LV) ÷ [0.3] +÷ 000D ÷ AC01 ÷ # ÷ [0.2] (CR) ÷ [4.0] HANGUL SYLLABLE GAG (LVT) ÷ [0.3] +÷ 000D ÷ 0308 ÷ AC01 ÷ # ÷ [0.2] (CR) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GAG (LVT) ÷ [0.3] +÷ 000A ÷ 0020 ÷ # ÷ [0.2] (LF) ÷ [4.0] SPACE (Other) ÷ [0.3] +÷ 000A ÷ 0308 ÷ 0020 ÷ # ÷ [0.2] (LF) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [999.0] SPACE (Other) ÷ [0.3] +÷ 000A ÷ 000D ÷ # ÷ [0.2] (LF) ÷ [4.0] (CR) ÷ [0.3] +÷ 000A ÷ 0308 ÷ 000D ÷ # ÷ [0.2] (LF) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [5.0] (CR) ÷ [0.3] +÷ 000A ÷ 000A ÷ # ÷ [0.2] (LF) ÷ [4.0] (LF) ÷ [0.3] +÷ 000A ÷ 0308 ÷ 000A ÷ # ÷ [0.2] (LF) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [5.0] (LF) ÷ [0.3] +÷ 000A ÷ 0001 ÷ # ÷ [0.2] (LF) ÷ [4.0] (Control) ÷ [0.3] +÷ 000A ÷ 0308 ÷ 0001 ÷ # ÷ [0.2] (LF) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [5.0] (Control) ÷ [0.3] +÷ 000A ÷ 0300 ÷ # ÷ [0.2] (LF) ÷ [4.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3] +÷ 000A ÷ 
0308 × 0300 ÷ # ÷ [0.2] (LF) ÷ [4.0] COMBINING DIAERESIS (Extend) × [9.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3] +÷ 000A ÷ 0E40 ÷ # ÷ [0.2] (LF) ÷ [4.0] THAI CHARACTER SARA E (Prepend) ÷ [0.3] +÷ 000A ÷ 0308 ÷ 0E40 ÷ # ÷ [0.2] (LF) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [999.0] THAI CHARACTER SARA E (Prepend) ÷ [0.3] +÷ 000A ÷ 0903 ÷ # ÷ [0.2] (LF) ÷ [4.0] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3] +÷ 000A ÷ 0308 × 0903 ÷ # ÷ [0.2] (LF) ÷ [4.0] COMBINING DIAERESIS (Extend) × [9.1] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3] +÷ 000A ÷ 1100 ÷ # ÷ [0.2] (LF) ÷ [4.0] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3] +÷ 000A ÷ 0308 ÷ 1100 ÷ # ÷ [0.2] (LF) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3] +÷ 000A ÷ 1160 ÷ # ÷ [0.2] (LF) ÷ [4.0] HANGUL JUNGSEONG FILLER (V) ÷ [0.3] +÷ 000A ÷ 0308 ÷ 1160 ÷ # ÷ [0.2] (LF) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JUNGSEONG FILLER (V) ÷ [0.3] +÷ 000A ÷ 11A8 ÷ # ÷ [0.2] (LF) ÷ [4.0] HANGUL JONGSEONG KIYEOK (T) ÷ [0.3] +÷ 000A ÷ 0308 ÷ 11A8 ÷ # ÷ [0.2] (LF) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JONGSEONG KIYEOK (T) ÷ [0.3] +÷ 000A ÷ AC00 ÷ # ÷ [0.2] (LF) ÷ [4.0] HANGUL SYLLABLE GA (LV) ÷ [0.3] +÷ 000A ÷ 0308 ÷ AC00 ÷ # ÷ [0.2] (LF) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GA (LV) ÷ [0.3] +÷ 000A ÷ AC01 ÷ # ÷ [0.2] (LF) ÷ [4.0] HANGUL SYLLABLE GAG (LVT) ÷ [0.3] +÷ 000A ÷ 0308 ÷ AC01 ÷ # ÷ [0.2] (LF) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GAG (LVT) ÷ [0.3] +÷ 0001 ÷ 0020 ÷ # ÷ [0.2] (Control) ÷ [4.0] SPACE (Other) ÷ [0.3] +÷ 0001 ÷ 0308 ÷ 0020 ÷ # ÷ [0.2] (Control) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [999.0] SPACE (Other) ÷ [0.3] +÷ 0001 ÷ 000D ÷ # ÷ [0.2] (Control) ÷ [4.0] (CR) ÷ [0.3] +÷ 0001 ÷ 0308 ÷ 000D ÷ # ÷ [0.2] (Control) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [5.0] (CR) ÷ [0.3] +÷ 0001 ÷ 000A ÷ # ÷ [0.2] (Control) ÷ [4.0] (LF) ÷ [0.3] +÷ 0001 ÷ 0308 ÷ 000A ÷ # ÷ [0.2] (Control) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [5.0] 
(LF) ÷ [0.3] +÷ 0001 ÷ 0001 ÷ # ÷ [0.2] (Control) ÷ [4.0] (Control) ÷ [0.3] +÷ 0001 ÷ 0308 ÷ 0001 ÷ # ÷ [0.2] (Control) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [5.0] (Control) ÷ [0.3] +÷ 0001 ÷ 0300 ÷ # ÷ [0.2] (Control) ÷ [4.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3] +÷ 0001 ÷ 0308 × 0300 ÷ # ÷ [0.2] (Control) ÷ [4.0] COMBINING DIAERESIS (Extend) × [9.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3] +÷ 0001 ÷ 0E40 ÷ # ÷ [0.2] (Control) ÷ [4.0] THAI CHARACTER SARA E (Prepend) ÷ [0.3] +÷ 0001 ÷ 0308 ÷ 0E40 ÷ # ÷ [0.2] (Control) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [999.0] THAI CHARACTER SARA E (Prepend) ÷ [0.3] +÷ 0001 ÷ 0903 ÷ # ÷ [0.2] (Control) ÷ [4.0] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3] +÷ 0001 ÷ 0308 × 0903 ÷ # ÷ [0.2] (Control) ÷ [4.0] COMBINING DIAERESIS (Extend) × [9.1] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3] +÷ 0001 ÷ 1100 ÷ # ÷ [0.2] (Control) ÷ [4.0] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3] +÷ 0001 ÷ 0308 ÷ 1100 ÷ # ÷ [0.2] (Control) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3] +÷ 0001 ÷ 1160 ÷ # ÷ [0.2] (Control) ÷ [4.0] HANGUL JUNGSEONG FILLER (V) ÷ [0.3] +÷ 0001 ÷ 0308 ÷ 1160 ÷ # ÷ [0.2] (Control) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JUNGSEONG FILLER (V) ÷ [0.3] +÷ 0001 ÷ 11A8 ÷ # ÷ [0.2] (Control) ÷ [4.0] HANGUL JONGSEONG KIYEOK (T) ÷ [0.3] +÷ 0001 ÷ 0308 ÷ 11A8 ÷ # ÷ [0.2] (Control) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JONGSEONG KIYEOK (T) ÷ [0.3] +÷ 0001 ÷ AC00 ÷ # ÷ [0.2] (Control) ÷ [4.0] HANGUL SYLLABLE GA (LV) ÷ [0.3] +÷ 0001 ÷ 0308 ÷ AC00 ÷ # ÷ [0.2] (Control) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GA (LV) ÷ [0.3] +÷ 0001 ÷ AC01 ÷ # ÷ [0.2] (Control) ÷ [4.0] HANGUL SYLLABLE GAG (LVT) ÷ [0.3] +÷ 0001 ÷ 0308 ÷ AC01 ÷ # ÷ [0.2] (Control) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GAG (LVT) ÷ [0.3] +÷ 0300 ÷ 0020 ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend) ÷ [999.0] SPACE (Other) ÷ [0.3] +÷ 0300 × 0308 ÷ 0020 ÷ # ÷ [0.2] COMBINING 
GRAVE ACCENT (Extend) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] SPACE (Other) ÷ [0.3] +÷ 0300 ÷ 000D ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend) ÷ [5.0] (CR) ÷ [0.3] +÷ 0300 × 0308 ÷ 000D ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] (CR) ÷ [0.3] +÷ 0300 ÷ 000A ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend) ÷ [5.0] (LF) ÷ [0.3] +÷ 0300 × 0308 ÷ 000A ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] (LF) ÷ [0.3] +÷ 0300 ÷ 0001 ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend) ÷ [5.0] (Control) ÷ [0.3] +÷ 0300 × 0308 ÷ 0001 ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] (Control) ÷ [0.3] +÷ 0300 × 0300 ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend) × [9.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3] +÷ 0300 × 0308 × 0300 ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend) × [9.0] COMBINING DIAERESIS (Extend) × [9.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3] +÷ 0300 ÷ 0E40 ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend) ÷ [999.0] THAI CHARACTER SARA E (Prepend) ÷ [0.3] +÷ 0300 × 0308 ÷ 0E40 ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] THAI CHARACTER SARA E (Prepend) ÷ [0.3] +÷ 0300 × 0903 ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend) × [9.1] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3] +÷ 0300 × 0308 × 0903 ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend) × [9.0] COMBINING DIAERESIS (Extend) × [9.1] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3] +÷ 0300 ÷ 1100 ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend) ÷ [999.0] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3] +÷ 0300 × 0308 ÷ 1100 ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3] +÷ 0300 ÷ 1160 ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend) ÷ [999.0] HANGUL JUNGSEONG FILLER (V) ÷ [0.3] +÷ 0300 × 0308 ÷ 1160 ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JUNGSEONG FILLER 
(V) ÷ [0.3] +÷ 0300 ÷ 11A8 ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend) ÷ [999.0] HANGUL JONGSEONG KIYEOK (T) ÷ [0.3] +÷ 0300 × 0308 ÷ 11A8 ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JONGSEONG KIYEOK (T) ÷ [0.3] +÷ 0300 ÷ AC00 ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend) ÷ [999.0] HANGUL SYLLABLE GA (LV) ÷ [0.3] +÷ 0300 × 0308 ÷ AC00 ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GA (LV) ÷ [0.3] +÷ 0300 ÷ AC01 ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend) ÷ [999.0] HANGUL SYLLABLE GAG (LVT) ÷ [0.3] +÷ 0300 × 0308 ÷ AC01 ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GAG (LVT) ÷ [0.3] +÷ 0E40 × 0020 ÷ # ÷ [0.2] THAI CHARACTER SARA E (Prepend) × [9.2] SPACE (Other) ÷ [0.3] +÷ 0E40 × 0308 ÷ 0020 ÷ # ÷ [0.2] THAI CHARACTER SARA E (Prepend) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] SPACE (Other) ÷ [0.3] +÷ 0E40 ÷ 000D ÷ # ÷ [0.2] THAI CHARACTER SARA E (Prepend) ÷ [5.0] (CR) ÷ [0.3] +÷ 0E40 × 0308 ÷ 000D ÷ # ÷ [0.2] THAI CHARACTER SARA E (Prepend) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] (CR) ÷ [0.3] +÷ 0E40 ÷ 000A ÷ # ÷ [0.2] THAI CHARACTER SARA E (Prepend) ÷ [5.0] (LF) ÷ [0.3] +÷ 0E40 × 0308 ÷ 000A ÷ # ÷ [0.2] THAI CHARACTER SARA E (Prepend) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] (LF) ÷ [0.3] +÷ 0E40 ÷ 0001 ÷ # ÷ [0.2] THAI CHARACTER SARA E (Prepend) ÷ [5.0] (Control) ÷ [0.3] +÷ 0E40 × 0308 ÷ 0001 ÷ # ÷ [0.2] THAI CHARACTER SARA E (Prepend) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] (Control) ÷ [0.3] +÷ 0E40 × 0300 ÷ # ÷ [0.2] THAI CHARACTER SARA E (Prepend) × [9.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3] +÷ 0E40 × 0308 × 0300 ÷ # ÷ [0.2] THAI CHARACTER SARA E (Prepend) × [9.0] COMBINING DIAERESIS (Extend) × [9.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3] +÷ 0E40 × 0E40 ÷ # ÷ [0.2] THAI CHARACTER SARA E (Prepend) × [9.2] THAI CHARACTER SARA E (Prepend) ÷ [0.3] +÷ 0E40 × 0308 ÷ 0E40 ÷ # 
÷ [0.2] THAI CHARACTER SARA E (Prepend) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] THAI CHARACTER SARA E (Prepend) ÷ [0.3] +÷ 0E40 × 0903 ÷ # ÷ [0.2] THAI CHARACTER SARA E (Prepend) × [9.1] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3] +÷ 0E40 × 0308 × 0903 ÷ # ÷ [0.2] THAI CHARACTER SARA E (Prepend) × [9.0] COMBINING DIAERESIS (Extend) × [9.1] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3] +÷ 0E40 × 1100 ÷ # ÷ [0.2] THAI CHARACTER SARA E (Prepend) × [9.2] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3] +÷ 0E40 × 0308 ÷ 1100 ÷ # ÷ [0.2] THAI CHARACTER SARA E (Prepend) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3] +÷ 0E40 × 1160 ÷ # ÷ [0.2] THAI CHARACTER SARA E (Prepend) × [9.2] HANGUL JUNGSEONG FILLER (V) ÷ [0.3] +÷ 0E40 × 0308 ÷ 1160 ÷ # ÷ [0.2] THAI CHARACTER SARA E (Prepend) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JUNGSEONG FILLER (V) ÷ [0.3] +÷ 0E40 × 11A8 ÷ # ÷ [0.2] THAI CHARACTER SARA E (Prepend) × [9.2] HANGUL JONGSEONG KIYEOK (T) ÷ [0.3] +÷ 0E40 × 0308 ÷ 11A8 ÷ # ÷ [0.2] THAI CHARACTER SARA E (Prepend) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JONGSEONG KIYEOK (T) ÷ [0.3] +÷ 0E40 × AC00 ÷ # ÷ [0.2] THAI CHARACTER SARA E (Prepend) × [9.2] HANGUL SYLLABLE GA (LV) ÷ [0.3] +÷ 0E40 × 0308 ÷ AC00 ÷ # ÷ [0.2] THAI CHARACTER SARA E (Prepend) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GA (LV) ÷ [0.3] +÷ 0E40 × AC01 ÷ # ÷ [0.2] THAI CHARACTER SARA E (Prepend) × [9.2] HANGUL SYLLABLE GAG (LVT) ÷ [0.3] +÷ 0E40 × 0308 ÷ AC01 ÷ # ÷ [0.2] THAI CHARACTER SARA E (Prepend) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GAG (LVT) ÷ [0.3] +÷ 0903 ÷ 0020 ÷ # ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [999.0] SPACE (Other) ÷ [0.3] +÷ 0903 × 0308 ÷ 0020 ÷ # ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] SPACE (Other) ÷ [0.3] +÷ 0903 ÷ 000D ÷ # ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [5.0] (CR) ÷ [0.3] +÷ 0903 × 0308 ÷ 000D ÷ # ÷ 
[0.2] DEVANAGARI SIGN VISARGA (SpacingMark) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] (CR) ÷ [0.3] +÷ 0903 ÷ 000A ÷ # ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [5.0] (LF) ÷ [0.3] +÷ 0903 × 0308 ÷ 000A ÷ # ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] (LF) ÷ [0.3] +÷ 0903 ÷ 0001 ÷ # ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [5.0] (Control) ÷ [0.3] +÷ 0903 × 0308 ÷ 0001 ÷ # ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] (Control) ÷ [0.3] +÷ 0903 × 0300 ÷ # ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) × [9.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3] +÷ 0903 × 0308 × 0300 ÷ # ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) × [9.0] COMBINING DIAERESIS (Extend) × [9.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3] +÷ 0903 ÷ 0E40 ÷ # ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [999.0] THAI CHARACTER SARA E (Prepend) ÷ [0.3] +÷ 0903 × 0308 ÷ 0E40 ÷ # ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] THAI CHARACTER SARA E (Prepend) ÷ [0.3] +÷ 0903 × 0903 ÷ # ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) × [9.1] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3] +÷ 0903 × 0308 × 0903 ÷ # ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) × [9.0] COMBINING DIAERESIS (Extend) × [9.1] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3] +÷ 0903 ÷ 1100 ÷ # ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [999.0] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3] +÷ 0903 × 0308 ÷ 1100 ÷ # ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3] +÷ 0903 ÷ 1160 ÷ # ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [999.0] HANGUL JUNGSEONG FILLER (V) ÷ [0.3] +÷ 0903 × 0308 ÷ 1160 ÷ # ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JUNGSEONG FILLER (V) ÷ [0.3] +÷ 0903 ÷ 11A8 ÷ # ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [999.0] HANGUL JONGSEONG KIYEOK 
(T) ÷ [0.3] +÷ 0903 × 0308 ÷ 11A8 ÷ # ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JONGSEONG KIYEOK (T) ÷ [0.3] +÷ 0903 ÷ AC00 ÷ # ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [999.0] HANGUL SYLLABLE GA (LV) ÷ [0.3] +÷ 0903 × 0308 ÷ AC00 ÷ # ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GA (LV) ÷ [0.3] +÷ 0903 ÷ AC01 ÷ # ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [999.0] HANGUL SYLLABLE GAG (LVT) ÷ [0.3] +÷ 0903 × 0308 ÷ AC01 ÷ # ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GAG (LVT) ÷ [0.3] +÷ 1100 ÷ 0020 ÷ # ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) ÷ [999.0] SPACE (Other) ÷ [0.3] +÷ 1100 × 0308 ÷ 0020 ÷ # ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] SPACE (Other) ÷ [0.3] +÷ 1100 ÷ 000D ÷ # ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) ÷ [5.0] (CR) ÷ [0.3] +÷ 1100 × 0308 ÷ 000D ÷ # ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] (CR) ÷ [0.3] +÷ 1100 ÷ 000A ÷ # ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) ÷ [5.0] (LF) ÷ [0.3] +÷ 1100 × 0308 ÷ 000A ÷ # ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] (LF) ÷ [0.3] +÷ 1100 ÷ 0001 ÷ # ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) ÷ [5.0] (Control) ÷ [0.3] +÷ 1100 × 0308 ÷ 0001 ÷ # ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] (Control) ÷ [0.3] +÷ 1100 × 0300 ÷ # ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) × [9.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3] +÷ 1100 × 0308 × 0300 ÷ # ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) × [9.0] COMBINING DIAERESIS (Extend) × [9.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3] +÷ 1100 ÷ 0E40 ÷ # ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) ÷ [999.0] THAI CHARACTER SARA E (Prepend) ÷ [0.3] +÷ 1100 × 0308 ÷ 0E40 ÷ # ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] THAI CHARACTER SARA E (Prepend) ÷ [0.3] +÷ 
1100 × 0903 ÷ # ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) × [9.1] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3] +÷ 1100 × 0308 × 0903 ÷ # ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) × [9.0] COMBINING DIAERESIS (Extend) × [9.1] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3] +÷ 1100 × 1100 ÷ # ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) × [6.0] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3] +÷ 1100 × 0308 ÷ 1100 ÷ # ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3] +÷ 1100 × 1160 ÷ # ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) × [6.0] HANGUL JUNGSEONG FILLER (V) ÷ [0.3] +÷ 1100 × 0308 ÷ 1160 ÷ # ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JUNGSEONG FILLER (V) ÷ [0.3] +÷ 1100 ÷ 11A8 ÷ # ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) ÷ [999.0] HANGUL JONGSEONG KIYEOK (T) ÷ [0.3] +÷ 1100 × 0308 ÷ 11A8 ÷ # ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JONGSEONG KIYEOK (T) ÷ [0.3] +÷ 1100 × AC00 ÷ # ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) × [6.0] HANGUL SYLLABLE GA (LV) ÷ [0.3] +÷ 1100 × 0308 ÷ AC00 ÷ # ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GA (LV) ÷ [0.3] +÷ 1100 × AC01 ÷ # ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) × [6.0] HANGUL SYLLABLE GAG (LVT) ÷ [0.3] +÷ 1100 × 0308 ÷ AC01 ÷ # ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GAG (LVT) ÷ [0.3] +÷ 1160 ÷ 0020 ÷ # ÷ [0.2] HANGUL JUNGSEONG FILLER (V) ÷ [999.0] SPACE (Other) ÷ [0.3] +÷ 1160 × 0308 ÷ 0020 ÷ # ÷ [0.2] HANGUL JUNGSEONG FILLER (V) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] SPACE (Other) ÷ [0.3] +÷ 1160 ÷ 000D ÷ # ÷ [0.2] HANGUL JUNGSEONG FILLER (V) ÷ [5.0] (CR) ÷ [0.3] +÷ 1160 × 0308 ÷ 000D ÷ # ÷ [0.2] HANGUL JUNGSEONG FILLER (V) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] (CR) ÷ [0.3] +÷ 1160 ÷ 000A ÷ # ÷ [0.2] HANGUL JUNGSEONG FILLER (V) ÷ [5.0] (LF) ÷ [0.3] +÷ 1160 × 0308 ÷ 000A ÷ # ÷ [0.2] HANGUL JUNGSEONG 
FILLER (V) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] (LF) ÷ [0.3] +÷ 1160 ÷ 0001 ÷ # ÷ [0.2] HANGUL JUNGSEONG FILLER (V) ÷ [5.0] (Control) ÷ [0.3] +÷ 1160 × 0308 ÷ 0001 ÷ # ÷ [0.2] HANGUL JUNGSEONG FILLER (V) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] (Control) ÷ [0.3] +÷ 1160 × 0300 ÷ # ÷ [0.2] HANGUL JUNGSEONG FILLER (V) × [9.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3] +÷ 1160 × 0308 × 0300 ÷ # ÷ [0.2] HANGUL JUNGSEONG FILLER (V) × [9.0] COMBINING DIAERESIS (Extend) × [9.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3] +÷ 1160 ÷ 0E40 ÷ # ÷ [0.2] HANGUL JUNGSEONG FILLER (V) ÷ [999.0] THAI CHARACTER SARA E (Prepend) ÷ [0.3] +÷ 1160 × 0308 ÷ 0E40 ÷ # ÷ [0.2] HANGUL JUNGSEONG FILLER (V) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] THAI CHARACTER SARA E (Prepend) ÷ [0.3] +÷ 1160 × 0903 ÷ # ÷ [0.2] HANGUL JUNGSEONG FILLER (V) × [9.1] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3] +÷ 1160 × 0308 × 0903 ÷ # ÷ [0.2] HANGUL JUNGSEONG FILLER (V) × [9.0] COMBINING DIAERESIS (Extend) × [9.1] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3] +÷ 1160 ÷ 1100 ÷ # ÷ [0.2] HANGUL JUNGSEONG FILLER (V) ÷ [999.0] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3] +÷ 1160 × 0308 ÷ 1100 ÷ # ÷ [0.2] HANGUL JUNGSEONG FILLER (V) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3] +÷ 1160 × 1160 ÷ # ÷ [0.2] HANGUL JUNGSEONG FILLER (V) × [7.0] HANGUL JUNGSEONG FILLER (V) ÷ [0.3] +÷ 1160 × 0308 ÷ 1160 ÷ # ÷ [0.2] HANGUL JUNGSEONG FILLER (V) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JUNGSEONG FILLER (V) ÷ [0.3] +÷ 1160 × 11A8 ÷ # ÷ [0.2] HANGUL JUNGSEONG FILLER (V) × [7.0] HANGUL JONGSEONG KIYEOK (T) ÷ [0.3] +÷ 1160 × 0308 ÷ 11A8 ÷ # ÷ [0.2] HANGUL JUNGSEONG FILLER (V) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JONGSEONG KIYEOK (T) ÷ [0.3] +÷ 1160 ÷ AC00 ÷ # ÷ [0.2] HANGUL JUNGSEONG FILLER (V) ÷ [999.0] HANGUL SYLLABLE GA (LV) ÷ [0.3] +÷ 1160 × 0308 ÷ AC00 ÷ # ÷ [0.2] HANGUL JUNGSEONG FILLER (V) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GA 
(LV) ÷ [0.3] +÷ 1160 ÷ AC01 ÷ # ÷ [0.2] HANGUL JUNGSEONG FILLER (V) ÷ [999.0] HANGUL SYLLABLE GAG (LVT) ÷ [0.3] +÷ 1160 × 0308 ÷ AC01 ÷ # ÷ [0.2] HANGUL JUNGSEONG FILLER (V) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GAG (LVT) ÷ [0.3] +÷ 11A8 ÷ 0020 ÷ # ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) ÷ [999.0] SPACE (Other) ÷ [0.3] +÷ 11A8 × 0308 ÷ 0020 ÷ # ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] SPACE (Other) ÷ [0.3] +÷ 11A8 ÷ 000D ÷ # ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) ÷ [5.0] (CR) ÷ [0.3] +÷ 11A8 × 0308 ÷ 000D ÷ # ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] (CR) ÷ [0.3] +÷ 11A8 ÷ 000A ÷ # ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) ÷ [5.0] (LF) ÷ [0.3] +÷ 11A8 × 0308 ÷ 000A ÷ # ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] (LF) ÷ [0.3] +÷ 11A8 ÷ 0001 ÷ # ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) ÷ [5.0] (Control) ÷ [0.3] +÷ 11A8 × 0308 ÷ 0001 ÷ # ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] (Control) ÷ [0.3] +÷ 11A8 × 0300 ÷ # ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) × [9.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3] +÷ 11A8 × 0308 × 0300 ÷ # ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) × [9.0] COMBINING DIAERESIS (Extend) × [9.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3] +÷ 11A8 ÷ 0E40 ÷ # ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) ÷ [999.0] THAI CHARACTER SARA E (Prepend) ÷ [0.3] +÷ 11A8 × 0308 ÷ 0E40 ÷ # ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] THAI CHARACTER SARA E (Prepend) ÷ [0.3] +÷ 11A8 × 0903 ÷ # ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) × [9.1] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3] +÷ 11A8 × 0308 × 0903 ÷ # ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) × [9.0] COMBINING DIAERESIS (Extend) × [9.1] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3] +÷ 11A8 ÷ 1100 ÷ # ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) ÷ [999.0] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3] +÷ 11A8 × 0308 ÷ 1100 ÷ # ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) 
× [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3] +÷ 11A8 ÷ 1160 ÷ # ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) ÷ [999.0] HANGUL JUNGSEONG FILLER (V) ÷ [0.3] +÷ 11A8 × 0308 ÷ 1160 ÷ # ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JUNGSEONG FILLER (V) ÷ [0.3] +÷ 11A8 × 11A8 ÷ # ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) × [8.0] HANGUL JONGSEONG KIYEOK (T) ÷ [0.3] +÷ 11A8 × 0308 ÷ 11A8 ÷ # ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JONGSEONG KIYEOK (T) ÷ [0.3] +÷ 11A8 ÷ AC00 ÷ # ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) ÷ [999.0] HANGUL SYLLABLE GA (LV) ÷ [0.3] +÷ 11A8 × 0308 ÷ AC00 ÷ # ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GA (LV) ÷ [0.3] +÷ 11A8 ÷ AC01 ÷ # ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) ÷ [999.0] HANGUL SYLLABLE GAG (LVT) ÷ [0.3] +÷ 11A8 × 0308 ÷ AC01 ÷ # ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GAG (LVT) ÷ [0.3] +÷ AC00 ÷ 0020 ÷ # ÷ [0.2] HANGUL SYLLABLE GA (LV) ÷ [999.0] SPACE (Other) ÷ [0.3] +÷ AC00 × 0308 ÷ 0020 ÷ # ÷ [0.2] HANGUL SYLLABLE GA (LV) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] SPACE (Other) ÷ [0.3] +÷ AC00 ÷ 000D ÷ # ÷ [0.2] HANGUL SYLLABLE GA (LV) ÷ [5.0] (CR) ÷ [0.3] +÷ AC00 × 0308 ÷ 000D ÷ # ÷ [0.2] HANGUL SYLLABLE GA (LV) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] (CR) ÷ [0.3] +÷ AC00 ÷ 000A ÷ # ÷ [0.2] HANGUL SYLLABLE GA (LV) ÷ [5.0] (LF) ÷ [0.3] +÷ AC00 × 0308 ÷ 000A ÷ # ÷ [0.2] HANGUL SYLLABLE GA (LV) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] (LF) ÷ [0.3] +÷ AC00 ÷ 0001 ÷ # ÷ [0.2] HANGUL SYLLABLE GA (LV) ÷ [5.0] (Control) ÷ [0.3] +÷ AC00 × 0308 ÷ 0001 ÷ # ÷ [0.2] HANGUL SYLLABLE GA (LV) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] (Control) ÷ [0.3] +÷ AC00 × 0300 ÷ # ÷ [0.2] HANGUL SYLLABLE GA (LV) × [9.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3] +÷ AC00 × 0308 × 0300 ÷ # ÷ [0.2] HANGUL SYLLABLE GA (LV) × 
[9.0] COMBINING DIAERESIS (Extend) × [9.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3] +÷ AC00 ÷ 0E40 ÷ # ÷ [0.2] HANGUL SYLLABLE GA (LV) ÷ [999.0] THAI CHARACTER SARA E (Prepend) ÷ [0.3] +÷ AC00 × 0308 ÷ 0E40 ÷ # ÷ [0.2] HANGUL SYLLABLE GA (LV) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] THAI CHARACTER SARA E (Prepend) ÷ [0.3] +÷ AC00 × 0903 ÷ # ÷ [0.2] HANGUL SYLLABLE GA (LV) × [9.1] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3] +÷ AC00 × 0308 × 0903 ÷ # ÷ [0.2] HANGUL SYLLABLE GA (LV) × [9.0] COMBINING DIAERESIS (Extend) × [9.1] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3] +÷ AC00 ÷ 1100 ÷ # ÷ [0.2] HANGUL SYLLABLE GA (LV) ÷ [999.0] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3] +÷ AC00 × 0308 ÷ 1100 ÷ # ÷ [0.2] HANGUL SYLLABLE GA (LV) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3] +÷ AC00 × 1160 ÷ # ÷ [0.2] HANGUL SYLLABLE GA (LV) × [7.0] HANGUL JUNGSEONG FILLER (V) ÷ [0.3] +÷ AC00 × 0308 ÷ 1160 ÷ # ÷ [0.2] HANGUL SYLLABLE GA (LV) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JUNGSEONG FILLER (V) ÷ [0.3] +÷ AC00 × 11A8 ÷ # ÷ [0.2] HANGUL SYLLABLE GA (LV) × [7.0] HANGUL JONGSEONG KIYEOK (T) ÷ [0.3] +÷ AC00 × 0308 ÷ 11A8 ÷ # ÷ [0.2] HANGUL SYLLABLE GA (LV) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JONGSEONG KIYEOK (T) ÷ [0.3] +÷ AC00 ÷ AC00 ÷ # ÷ [0.2] HANGUL SYLLABLE GA (LV) ÷ [999.0] HANGUL SYLLABLE GA (LV) ÷ [0.3] +÷ AC00 × 0308 ÷ AC00 ÷ # ÷ [0.2] HANGUL SYLLABLE GA (LV) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GA (LV) ÷ [0.3] +÷ AC00 ÷ AC01 ÷ # ÷ [0.2] HANGUL SYLLABLE GA (LV) ÷ [999.0] HANGUL SYLLABLE GAG (LVT) ÷ [0.3] +÷ AC00 × 0308 ÷ AC01 ÷ # ÷ [0.2] HANGUL SYLLABLE GA (LV) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GAG (LVT) ÷ [0.3] +÷ AC01 ÷ 0020 ÷ # ÷ [0.2] HANGUL SYLLABLE GAG (LVT) ÷ [999.0] SPACE (Other) ÷ [0.3] +÷ AC01 × 0308 ÷ 0020 ÷ # ÷ [0.2] HANGUL SYLLABLE GAG (LVT) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] SPACE (Other) ÷ [0.3] +÷ AC01 ÷ 000D ÷ # ÷ [0.2] 
HANGUL SYLLABLE GAG (LVT) ÷ [5.0] (CR) ÷ [0.3] +÷ AC01 × 0308 ÷ 000D ÷ # ÷ [0.2] HANGUL SYLLABLE GAG (LVT) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] (CR) ÷ [0.3] +÷ AC01 ÷ 000A ÷ # ÷ [0.2] HANGUL SYLLABLE GAG (LVT) ÷ [5.0] (LF) ÷ [0.3] +÷ AC01 × 0308 ÷ 000A ÷ # ÷ [0.2] HANGUL SYLLABLE GAG (LVT) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] (LF) ÷ [0.3] +÷ AC01 ÷ 0001 ÷ # ÷ [0.2] HANGUL SYLLABLE GAG (LVT) ÷ [5.0] (Control) ÷ [0.3] +÷ AC01 × 0308 ÷ 0001 ÷ # ÷ [0.2] HANGUL SYLLABLE GAG (LVT) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] (Control) ÷ [0.3] +÷ AC01 × 0300 ÷ # ÷ [0.2] HANGUL SYLLABLE GAG (LVT) × [9.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3] +÷ AC01 × 0308 × 0300 ÷ # ÷ [0.2] HANGUL SYLLABLE GAG (LVT) × [9.0] COMBINING DIAERESIS (Extend) × [9.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3] +÷ AC01 ÷ 0E40 ÷ # ÷ [0.2] HANGUL SYLLABLE GAG (LVT) ÷ [999.0] THAI CHARACTER SARA E (Prepend) ÷ [0.3] +÷ AC01 × 0308 ÷ 0E40 ÷ # ÷ [0.2] HANGUL SYLLABLE GAG (LVT) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] THAI CHARACTER SARA E (Prepend) ÷ [0.3] +÷ AC01 × 0903 ÷ # ÷ [0.2] HANGUL SYLLABLE GAG (LVT) × [9.1] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3] +÷ AC01 × 0308 × 0903 ÷ # ÷ [0.2] HANGUL SYLLABLE GAG (LVT) × [9.0] COMBINING DIAERESIS (Extend) × [9.1] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3] +÷ AC01 ÷ 1100 ÷ # ÷ [0.2] HANGUL SYLLABLE GAG (LVT) ÷ [999.0] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3] +÷ AC01 × 0308 ÷ 1100 ÷ # ÷ [0.2] HANGUL SYLLABLE GAG (LVT) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3] +÷ AC01 ÷ 1160 ÷ # ÷ [0.2] HANGUL SYLLABLE GAG (LVT) ÷ [999.0] HANGUL JUNGSEONG FILLER (V) ÷ [0.3] +÷ AC01 × 0308 ÷ 1160 ÷ # ÷ [0.2] HANGUL SYLLABLE GAG (LVT) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JUNGSEONG FILLER (V) ÷ [0.3] +÷ AC01 × 11A8 ÷ # ÷ [0.2] HANGUL SYLLABLE GAG (LVT) × [8.0] HANGUL JONGSEONG KIYEOK (T) ÷ [0.3] +÷ AC01 × 0308 ÷ 11A8 ÷ # ÷ [0.2] HANGUL SYLLABLE GAG (LVT) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] 
HANGUL JONGSEONG KIYEOK (T) ÷ [0.3] +÷ AC01 ÷ AC00 ÷ # ÷ [0.2] HANGUL SYLLABLE GAG (LVT) ÷ [999.0] HANGUL SYLLABLE GA (LV) ÷ [0.3] +÷ AC01 × 0308 ÷ AC00 ÷ # ÷ [0.2] HANGUL SYLLABLE GAG (LVT) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GA (LV) ÷ [0.3] +÷ AC01 ÷ AC01 ÷ # ÷ [0.2] HANGUL SYLLABLE GAG (LVT) ÷ [999.0] HANGUL SYLLABLE GAG (LVT) ÷ [0.3] +÷ AC01 × 0308 ÷ AC01 ÷ # ÷ [0.2] HANGUL SYLLABLE GAG (LVT) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GAG (LVT) ÷ [0.3] +# Lines: 288 diff --git a/lib/unicore/mktables b/lib/unicore/mktables index f39466a..c61a3f4 100644 --- a/lib/unicore/mktables +++ b/lib/unicore/mktables @@ -998,6 +998,7 @@ my $MULTIPLE = 4; # Don't replace, but add a duplicate record if my $NORMAL = ""; my $SUPPRESSED = 'z'; # The character should never actually be seen, since # it is suppressed +my $PLACEHOLDER = 'P'; # Implies no pod entry generated my $DEPRECATED = 'D'; my $a_bold_deprecated = "a 'B<$DEPRECATED>'"; my $A_bold_deprecated = "A 'B<$DEPRECATED>'"; @@ -1019,7 +1020,7 @@ my %status_past_participles = ( $SUPPRESSED => 'should never be generated', $STABILIZED => 'stabilized', $OBSOLETE => 'obsolete', - $DEPRECATED => 'deprecated' + $DEPRECATED => 'deprecated', ); # The format of the values of the map tables: @@ -1074,6 +1075,7 @@ my %Jamo_L; # Leading consonants my %Jamo_V; # Vowels my %Jamo_T; # Trailing consonants +my @backslash_X_tests; # List of tests read in for testing \X my @unhandled_properties; # Will contain a list of properties found in # the input that we didn't process. my @match_properties; # Properties that have match tables, to be @@ -1604,6 +1606,15 @@ sub trace { return main::trace(@_); } # processed when you set the $debug_skip global. main::set_access('non_skip', \%non_skip, 'c'); + my %skip; + # This is used to skip processing of this input file semi-permanently. 
+ # It is used for files that we aren't planning to process anytime soon, + # but want to allow to be in the directory and not raise a message that we + # are not handling. Mostly for test files. This is in contrast to the + # non_skip element, which is supposed to be used very temporarily for + # debugging. Sets 'optional' to 1 + main::set_access('skip', \%skip, 'c'); + my %each_line_handler; # list of subroutines to look at and filter each non-comment line in the # file. defaults to none. The subroutines are called in order, each is @@ -1667,6 +1678,7 @@ sub trace { return main::trace(@_); } # Set defaults $handler{$addr} = \&main::process_generic_property_file; $non_skip{$addr} = 0; + $skip{$addr} = 0; $has_missings_defaults{$addr} = $NO_DEFAULTS; $handle{$addr} = undef; $added_lines{$addr} = [ ]; @@ -1723,6 +1735,8 @@ sub trace { return main::trace(@_); } print "Warning: " . __PACKAGE__ . " constructor for $file{$addr} has useless 'non_skip' in it\n"; } + $optional{$addr} = 1 if $skip{$addr}; + return $self; } @@ -1771,7 +1785,7 @@ sub trace { return main::trace(@_); } } # File could be optional - if ($optional{$addr}){ + if ($optional{$addr}) { return unless -e $file; my $result = eval $optional{$addr}; if (! defined $result) { @@ -1804,7 +1818,8 @@ sub trace { return main::trace(@_); } } else { - # Here, the file exists + # Here, the file exists. Some platforms may change the case of + # its name if ($seen_non_extracted_non_age) { if ($file =~ /$EXTRACTED/i) { Carp::my_carp_bug(join_lines(<= $VERBOSE; + return; + } + # Open the file, converting the slashes used in this program # into the proper form for the OS my $file_handle; @@ -3846,14 +3868,12 @@ sub trace { return main::trace(@_); } # For non-ASCII, we shun the characters that don't have Perl encoding- # independent symbols for them. 'A' is such a symbol, so is "\n". - # Note, this program hopefully will work on 5.8 Perls, and \v is not - # such a symbol in them. 
return $try_hard if $non_ASCII && $code <= 0xFF && ($code >= 0x7F || ($code >= 0x0E && $code <= 0x1F) || ($code >= 0x01 && $code <= 0x06) - || $code == 0x0B); # \v introduced after 5.8 + || $code == 0x0B); # shun null. I'm (khw) not sure why this was done, but NULL would be # the character very frequently used. @@ -4075,7 +4095,6 @@ sub trace { return main::trace(@_); } my $complete_name = $complete_name{$addr} = delete $args{'Complete_Name'}; $internal_only{$addr} = delete $args{'Internal_Only_Warning'} || 0; - $perl_extension{$addr} = delete $args{'Perl_Extension'} || 0; $property{$addr} = delete $args{'_Property'}; $range_list{$addr} = delete $args{'_Range_List'}; $status{$addr} = delete $args{'Status'} || $NORMAL; @@ -4087,6 +4106,7 @@ sub trace { return main::trace(@_); } my $loose_match = delete $args{'Fuzzy'}; my $note = delete $args{'Note'}; my $make_pod_entry = delete $args{'Pod_Entry'}; + my $perl_extension = delete $args{'Perl_Extension'}; # Shouldn't have any left over Carp::carp_extra_args(\%args) if main::DEBUG && %args; @@ -4105,11 +4125,20 @@ sub trace { return main::trace(@_); } push @{$description{$addr}}, $description if $description; push @{$note{$addr}}, $note if $note; - # If hasn't set its status already, see if it is on one of the lists - # of properties or tables that have particular statuses; if not, is - # normal. The lists are prioritized so the most serious ones are - # checked first - if (! $status{$addr}) { + if ($status{$addr} eq $PLACEHOLDER) { + + # A placeholder table doesn't get documented, is a perl extension, + # and quite likely will be empty + $make_pod_entry = 0 if ! defined $make_pod_entry; + $perl_extension = 1 if ! defined $perl_extension; + push @tables_that_may_be_empty, $complete_name{$addr}; + } + elsif (! $status{$addr}) { + + # If hasn't set its status already, see if it is on one of the + # lists of properties or tables that have particular statuses; if + # not, is normal. 
The lists are prioritized so the most serious + # ones are checked first if (exists $why_suppressed{$complete_name}) { $status{$addr} = $SUPPRESSED; } @@ -4145,6 +4174,8 @@ sub trace { return main::trace(@_); } } } + $perl_extension{$addr} = $perl_extension || 0; + # By convention what typically gets printed only or first is what's # first in the list, so put the full name there for good output # clarity. Other routines rely on the full name being first on the @@ -6204,7 +6235,17 @@ END my $flag = $property->status || $table->status || $table_alias_object->status; - $flags{$flag} = $status_past_participles{$flag} if $flag; + if ($flag) { + if ($flag ne $PLACEHOLDER) { + $flags{$flag} = $status_past_participles{$flag}; + } else { + $flags{$flag} = <note; push @conflicting, $table->conflicting; + # And this for output after all the tables. + push @global_comments, $table->comment; + # Compute an alternate compound name using the final property # synonym and the first table synonym with a colon instead of # the equal sign used elsewhere. @@ -6306,8 +6350,10 @@ END if (%flags) { foreach my $flag (sort keys %flags) { $comment .= <next_line) { + push @backslash_X_tests, $_; + } + + return; +} + sub process_NamedSequences { # NamedSequences.txt entries are just added to an array. Because these # don't look like the other tables, they have their own handler. 
@@ -10795,21 +10855,78 @@ sub compile_perl() { } # These are used in Unicode's definition of \X + my $begin = $perl->add_match_table('_X_Begin', Perl_Extension => 1); + my $extend = $perl->add_match_table('_X_Extend', Perl_Extension => 1); + my $gcb = property_ref('Grapheme_Cluster_Break'); - #my $extend = $perl->add_match_table('_X_Extend'); - my $extend = $perl->add_match_table('_GCB_Extend'); - # XXX until decide what todo my $begin = $perl->add_match_table('_X_Begin'); - if (defined $gcb) { - $extend += $gcb->table('Extend') + $gcb->table('SpacingMark') - #$begin += ~ ($gcb->table('Control') - # + $gcb->table('CR') - # + $gcb->table('LF')); + + # The 'extended' grapheme cluster came in 5.1. The non-extended + # definition differs too much from the traditional Perl one to use. + if (defined $gcb && defined $gcb->table('SpacingMark')) { + + # Note that assumes HST is defined; it came in an earlier release than + # GCB. In the line below, two negatives means: yes hangul + $begin += ~ property_ref('Hangul_Syllable_Type') + ->table('Not_Applicable') + + ~ ($gcb->table('Control') + + $gcb->table('CR') + + $gcb->table('LF')); + $begin->add_comment('For use in \X; matches: Hangul_Syllable | ! Control'); + + $extend += $gcb->table('Extend') + $gcb->table('SpacingMark'); + $extend->add_comment('For use in \X; matches: Extend | SpacingMark'); } else { # Old definition, used on early releases. $extend += $gc->table('Mark') - + 0x200C # ZWNJ - + 0x200D; # ZWJ - #$begin += ~ $extend; + + 0x200C # ZWNJ + + 0x200D; # ZWJ + $begin += ~ $extend; + + # Here we may have a release that has the regular grapheme cluster + # defined, or a release that doesn't have anything defined. + # We set things up so the Perl core degrades gracefully, possibly with + # placeholders that match nothing. + + if (! 
defined $gcb) { + $gcb = Property->new('GCB', Status => $PLACEHOLDER); + } + my $hst = property_ref('HST'); + if (!defined $hst) { + $hst = Property->new('HST', Status => $PLACEHOLDER); + $hst->add_match_table('Not_Applicable', + Initialize => $Any, + Matches_All => 1); + } + + # On some releases, here we may not have the needed tables for the + # perl core, in some releases we may. + foreach my $name (qw{ L LV LVT T V prepend }) { + my $table = $gcb->table($name); + if (! defined $table) { + $table = $gcb->add_match_table($name); + push @tables_that_may_be_empty, $table->complete_name; + } + + # The HST property predates the GCB one, and has identical tables + # for some of them, so use it if we can. + if ($table->is_empty + && defined $hst + && defined $hst->table($name)) + { + $table += $hst->table($name); + } + } + } + + # More GCB. If we found some hangul syllables, populate a combined + # table. + my $lv_lvt_v = $perl->add_match_table('_X_LV_LVT_V'); + my $LV = $gcb->table('LV'); + if ($LV->is_empty) { + push @tables_that_may_be_empty, $lv_lvt_v->complete_name; + } else { + $lv_lvt_v += $LV + $gcb->table('LVT') + $gcb->table('V'); + $lv_lvt_v->add_comment('For use in \X; matches: HST=LV | HST=LVT | HST=V'); } # Create a new property specially located that is a combination of the @@ -13231,6 +13348,11 @@ sub make_property_test_script() { } } } + + foreach my $test (@backslash_X_tests) { + print $OUT "Test_X('$test');\n"; + } + print $OUT "Finished();\n"; close $OUT; return; @@ -13380,6 +13502,9 @@ my @input_file_objects = ( Input_file->new('BidiMirroring.txt', v3.0.1, Property => 'Bidi_Mirroring_Glyph', ), + Input_file->new("NormalizationTest.txt", v3.0.1, + Skip => 1, + ), Input_file->new('CaseFolding.txt', v3.0.1, Pre_Handler => \&setup_case_folding, Each_Line_Handler => @@ -13417,6 +13542,18 @@ my @input_file_objects = ( Property => 'Grapheme_Cluster_Break', Has_Missings_Defaults => $NOT_IGNORED, ), + Input_file->new("$AUXILIARY/GCBTest.txt", v4.1.0, + 
Handler => \&process_GCB_test, + ), + Input_file->new("$AUXILIARY/LBTest.txt", v4.1.0, + Skip => 1, + ), + Input_file->new("$AUXILIARY/SBTest.txt", v4.1.0, + Skip => 1, + ), + Input_file->new("$AUXILIARY/WBTest.txt", v4.1.0, + Skip => 1, + ), Input_file->new("$AUXILIARY/SentenceBreakProperty.txt", v4.1.0, Property => 'Sentence_Break', Has_Missings_Defaults => $NOT_IGNORED, @@ -13427,6 +13564,9 @@ my @input_file_objects = ( Input_file->new('NameAliases.txt', v5.0.0, Property => 'Name_Alias', ), + Input_file->new("BidiTest.txt", v5.2.0, + Skip => 1, + ), Input_file->new('UnihanIndicesDictionary.txt', v5.2.0, Optional => 1, Each_Line_Handler => \&filter_unihan_line, @@ -13474,18 +13614,16 @@ END # Put into %potential_files a list of all the files in the directory structure # that could be inputs to this program, excluding those that we should ignore. -# Also don't consider test files. Use absolute file names because it makes it -# easier across machine types. +# Use absolute file names because it makes it easier across machine types. my @ignored_files_full_names = map { File::Spec->rel2abs( internal_file_to_platform($_)) } keys %ignored_files; File::Find::find({ wanted=>sub { - return unless /\.txt$/i; - return if /Test\.txt$/i; + return unless /\.txt$/i; # Some platforms change the name's case my $full = lc(File::Spec->rel2abs($_)); $potential_files{$full} = 1 - if ! grep { $full eq lc($_) } @ignored_files_full_names; + if ! grep { $full eq lc($_) } @ignored_files_full_names; return; } }, File::Spec->curdir()); @@ -13584,7 +13722,7 @@ if ($glob_list) { && $input_file_objects[$i]->file !~ /$EXTRACTED_DIR/i) { splice @input_file_objects, $i, 0, - Input_file->new($file, v0); + Input_file->new($file, v0); last; } } @@ -13758,28 +13896,53 @@ __DATA__ use strict; use warnings; -# Test the \p{} regular expression constructs. This file is constructed by -# mktables from the tables it generates, so if mktables is buggy, this won't -# necessarily catch those bugs. 
Tests are generated for all feasible -# properties; a few aren't currently feasible; see is_code_point_usable() -# in mktables for details. +# Test qr/\X/ and the \p{} regular expression constructs. This file is +# constructed by mktables from the tables it generates, so if mktables is +# buggy, this won't necessarily catch those bugs. Tests are generated for all +# feasible properties; a few aren't currently feasible; see +# is_code_point_usable() in mktables for details. # Standard test packages are not used because this manipulates SIG_WARN. It # exits 0 if every non-skipped test succeeded; -1 if any failed. my $Tests = 0; my $Fails = 0; -my $Skips = 0; my $non_ASCII = (ord('A') != 65); -# The first 127 ASCII characters in ordinal order, with the ones that don't -# have Perl names (as of 5.8) replaced by dots. The 127th is used as the -# string delimiter -my $ascii_to_ebcdic = "\0......\a\b\t\n.\f\r.................. !\"#\$\%&'()*+,-./0123456789:;<=>?\@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~"; -#for my $i (0..126) { -# print $i, ": ", substr($ascii_to_ebcdic, $i, 1), "\n"; -#} +# The 256 8-bit characters in ASCII ordinal order, with the ones that don't +# have Perl names replaced by -1 +my @ascii_ordered_chars = ( + "\0", + (-1) x 6, + "\a", "\b", "\t", "\n", + -1, # No Vt + "\f", "\r", + (-1) x 18, + " ", "!", "\"", "#", '$', "%", "&", "'", + "(", ")", "*", "+", ",", "-", ".", "/", + "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", + ":", ";", "<", "=", ">", "?", "@", + "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", + "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", + "[", "\\", "]", "^", "_", "`", + "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", + "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", + "{", "|", "}", "~", + (-1) x 129 +); + +sub ASCII_ord_to_native ($) { + # Converts input ordinal number to the native one, if can be done easily. + # Returns -1 otherwise. 
+ + my $ord = shift; + + return $ord if $ord > 255 || ! $non_ASCII; + my $result = $ascii_ordered_chars[$ord]; + return $result if $result eq '-1'; + return ord($result); +} sub Expect($$$$) { my $expected = shift; @@ -13789,38 +13952,24 @@ sub Expect($$$$) { # or empty if none my $line = (caller)[2]; + # Convert the non-ASCII code points expressible as characters to their + # ASCII equivalents, and skip the others. + $ord = ASCII_ord_to_native($ord); + if ($ord < 0) { + $Tests++; + print "ok $Tests - " + . sprintf("\"\\x{%04X}\"", $ord) + . " =~ $regex # Skipped: non-ASCII\n"; + return; + } + # Convert the code point to hex form my $string = sprintf "\"\\x{%04X}\"", $ord; - # Convert the non-ASCII code points expressible as characters in Perl 5.8 - # to their ASCII equivalents, and skip the others. - if ($non_ASCII && $ord < 255) { - - # Dots are used as place holders in the conversion string for the - # non-convertible ones, so check for it first. - if ($ord == 0x2E) { - $ord = ord('.'); - } - elsif ($ord < 0x7F - # Any dots returned are non-convertible. - && ((my $char = substr($ascii_to_ebcdic, $ord, 1)) ne '.')) - { - #print STDERR "$ord, $char, \n"; - $ord = ord($char); - } - else { - $Tests++; - $Skips++; - print "ok $Tests - $string =~ $regex # Skipped: non-ASCII\n"; - return; - } - } - - # The first time through, use all warnings. my @tests = ""; - # If the input should generate a warning, add another time through with - # them turned off + # The first time through, use all warnings. 
If the input should generate + # a warning, add another time through with them turned off push @tests, "no warnings '$warning_type';" if $warning_type; foreach my $no_warnings (@tests) { @@ -13880,9 +14029,142 @@ sub Error($) { return; } +# GCBTest.txt character that separates grapheme clusters +my $breakable_utf8 = my $breakable = chr(0xF7); +utf8::upgrade($breakable_utf8); + +# GCBTest.txt character that indicates that the adjoining code points are part +# of the same grapheme cluster +my $nobreak_utf8 = my $nobreak = chr(0xD7); +utf8::upgrade($nobreak_utf8); + +sub Test_X($) { + # Test qr/\X/ matches. The input is a line from auxiliary/GCBTest.txt + # Each such line is a sequence of code points given by their hex numbers, + # separated by the two characters defined just before this subroutine that + # indicate that either there can or cannot be a break between the adjacent + # code points. If there isn't a break, that means the sequence forms an + # extended grapheme cluster, which means that \X should match the whole + # thing. If there is a break, \X should stop there. This is all + # converted by this routine into a match: + # $string =~ /(\X)/, + # Each \X should match the next cluster; and that is what is checked. + + my $template = shift; + + my $line = (caller)[2]; + + # The line contains characters above the ASCII range, but in Latin1. It + # may or may not be in utf8, and if it is, it may or may not know it. So, + # convert these characters to 8 bits. If knows is in utf8, simply + # downgrade. + if (utf8::is_utf8($template)) { + utf8::downgrade($template); + } else { + + # Otherwise, if it is in utf8, but doesn't know it, the next lines + # convert the two problematic characters to their 8-bit equivalents. + # If it isn't in utf8, they don't harm anything. 
+ use bytes; + $template =~ s/$nobreak_utf8/$nobreak/g; + $template =~ s/$breakable_utf8/$breakable/g; + } + + # Get rid of the leading and trailing breakables + $template =~ s/^ \s* $breakable \s* //x; + $template =~ s/ \s* $breakable \s* $ //x; + + # And no-breaks become just a space. + $template =~ s/ \s* $nobreak \s* / /xg; + + # Split the input into segments that are breakable between them. + my @segments = split /\s*$breakable\s*/, $template; + + my $string = ""; + my $display_string = ""; + my @should_match; + my @should_display; + + # Convert the code point sequence in each segment into a Perl string of + # characters + foreach my $segment (@segments) { + my @code_points = split /\s+/, $segment; + my $this_string = ""; + my $this_display = ""; + foreach my $code_point (@code_points) { + my $ord = ASCII_ord_to_native(hex $code_point); + if ($ord < 0) { + $Tests++; + print "ok $Tests - String containing $code_point =~ /(\\X)/g # Skipped: non-ASCII\n"; + return; + } + $this_string .= chr $ord; + $this_display .= "\\x{$code_point}"; + } + + # The next cluster should match the string in this segment. + push @should_match, $this_string; + push @should_display, $this_display; + $string .= $this_string; + $display_string .= $this_display; + } + + # If a string can be represented in both non-ut8 and utf8, test both cases + UPGRADE: + for my $to_upgrade (0 .. 1) { + + if ($to_upgrade) { + + # If already in utf8, would just be a repeat + next UPGRADE if utf8::is_utf8($string); + + utf8::upgrade($string); + } + + # Finally, do the \X match. + my @matches = $string =~ /(\X)/g; + + # Look through each matched cluster to verify that it matches what we + # expect. + my $min = (@matches < @should_match) ? @matches : @should_match; + for my $i (0 .. 
$min - 1) { + $Tests++; + if ($matches[$i] eq $should_match[$i]) { + print "ok $Tests - "; + if ($i == 0) { + print "In \"$display_string\" =~ /(\\X)/g, \\X #1"; + } else { + print "And \\X #", $i + 1, + } + print " correctly matched $should_display[$i]; line $line\n"; + } else { + $matches[$i] = join("", map { sprintf "\\x{%04X}", $_ } + unpack("U*", $matches[$i])); + print "not ok $Tests - In \"$display_string\" =~ /(\\X)/g, \\X #", + $i + 1, + " should have matched $should_display[$i]", + " but instead matched $matches[$i]", + ". Abandoning rest of line $line\n"; + next UPGRADE; + } + } + + # And the number of matches should equal the number of expected matches. + $Tests++; + if (@matches == @should_match) { + print "ok $Tests - Nothing was left over; line $line\n"; + } else { + print "not ok $Tests - There were ", scalar @should_match, " \\X matches expected, but got ", scalar @matches, " instead; line $line\n"; + } + } + + return; +} + sub Finished() { print "1..$Tests\n"; exit($Fails ? 
-1 : 0); } Error('\p{Script=InGreek}'); # Bug #69018 +Test_X("1100 $nobreak 1161"); # Bug #70940 diff --git a/perlapi.h b/perlapi.h index 5c2df74..54ddab0 100644 --- a/perlapi.h +++ b/perlapi.h @@ -684,6 +684,26 @@ END_EXTERN_C #define PL_unlockhook (*Perl_Iunlockhook_ptr(aTHX)) #undef PL_unsafe #define PL_unsafe (*Perl_Iunsafe_ptr(aTHX)) +#undef PL_utf8_X_L +#define PL_utf8_X_L (*Perl_Iutf8_X_L_ptr(aTHX)) +#undef PL_utf8_X_LV +#define PL_utf8_X_LV (*Perl_Iutf8_X_LV_ptr(aTHX)) +#undef PL_utf8_X_LVT +#define PL_utf8_X_LVT (*Perl_Iutf8_X_LVT_ptr(aTHX)) +#undef PL_utf8_X_LV_LVT_V +#define PL_utf8_X_LV_LVT_V (*Perl_Iutf8_X_LV_LVT_V_ptr(aTHX)) +#undef PL_utf8_X_T +#define PL_utf8_X_T (*Perl_Iutf8_X_T_ptr(aTHX)) +#undef PL_utf8_X_V +#define PL_utf8_X_V (*Perl_Iutf8_X_V_ptr(aTHX)) +#undef PL_utf8_X_begin +#define PL_utf8_X_begin (*Perl_Iutf8_X_begin_ptr(aTHX)) +#undef PL_utf8_X_extend +#define PL_utf8_X_extend (*Perl_Iutf8_X_extend_ptr(aTHX)) +#undef PL_utf8_X_non_hangul +#define PL_utf8_X_non_hangul (*Perl_Iutf8_X_non_hangul_ptr(aTHX)) +#undef PL_utf8_X_prepend +#define PL_utf8_X_prepend (*Perl_Iutf8_X_prepend_ptr(aTHX)) #undef PL_utf8_alnum #define PL_utf8_alnum (*Perl_Iutf8_alnum_ptr(aTHX)) #undef PL_utf8_alpha diff --git a/pod/perl5113delta.pod b/pod/perl5113delta.pod index 2e1ddf8..ec2443c 100644 --- a/pod/perl5113delta.pod +++ b/pod/perl5113delta.pod @@ -38,6 +38,12 @@ Perl is shipped with the latest Unicode version, 5.2, October 2009. See L for details about this release of Unicode. +But, an installation can now fairly easily change Perl to operate on any +Unicode release. Perl is shipped with the latest official release, but +an installation can download and install any prior release from Unicode, and +cause Perl to work with that (or even multiple releases). Instructions are in +L. + =head2 Unicode properties Perl can now handle every Unicode character property. A new pod, @@ -58,6 +64,15 @@ underscores between digits of numbers. 
All the Unicode-defined synonyms for properties and property values are now accepted. +C<qr/\X/>, which matches a Unicode logical character, has been expanded to work +better with various Asian languages. It now is defined as an C<extended grapheme cluster>. (See L<http://www.unicode.org/reports/tr29/>). One change +due to this is that C<\X> will match the whole sequence C<E<lt>CR LFE<gt>>. Another +change is that C<\X> will match an isolated mark. Marks generally come after a +base character, but it is possible in Unicode to have them in isolation, and +C<\X> will now handle that case. Otherwise, this change should be transparent +for the non-affected languages. + C<\p{...}> matches using the Canonical_Combining_Class property were completely broken in previous Perls. This is now fixed. @@ -120,11 +135,6 @@ Other_Default_Ignorable_Code_Point, Other_Grapheme_Extend, Other_ID_Continue, Other_ID_Start, Other_Lowercase, Other_Math, and Other_Uppercase. -An installation can now fairly easily change Perl to operate on any -Unicode release. Perl is shipped with the latest official release, but -an installation can now download any prior release, and Perl will work -with that. Instructions are in L. - An installation can now fairly easily change which Unicode properties Perl understands. As mentioned above, certain properties are by default turned off.
These include all the Unihan properties (which should be diff --git a/proto.h b/proto.h index eab3e82..02fdd2d 100644 --- a/proto.h +++ b/proto.h @@ -1414,6 +1414,66 @@ PERL_CALLCONV bool Perl_is_utf8_mark(pTHX_ const U8 *p) #define PERL_ARGS_ASSERT_IS_UTF8_MARK \ assert(p) +PERL_CALLCONV bool Perl_is_utf8_X_begin(pTHX_ const U8 *p) + __attribute__warn_unused_result__ + __attribute__nonnull__(pTHX_1); +#define PERL_ARGS_ASSERT_IS_UTF8_X_BEGIN \ + assert(p) + +PERL_CALLCONV bool Perl_is_utf8_X_extend(pTHX_ const U8 *p) + __attribute__warn_unused_result__ + __attribute__nonnull__(pTHX_1); +#define PERL_ARGS_ASSERT_IS_UTF8_X_EXTEND \ + assert(p) + +PERL_CALLCONV bool Perl_is_utf8_X_prepend(pTHX_ const U8 *p) + __attribute__warn_unused_result__ + __attribute__nonnull__(pTHX_1); +#define PERL_ARGS_ASSERT_IS_UTF8_X_PREPEND \ + assert(p) + +PERL_CALLCONV bool Perl_is_utf8_X_non_hangul(pTHX_ const U8 *p) + __attribute__warn_unused_result__ + __attribute__nonnull__(pTHX_1); +#define PERL_ARGS_ASSERT_IS_UTF8_X_NON_HANGUL \ + assert(p) + +PERL_CALLCONV bool Perl_is_utf8_X_L(pTHX_ const U8 *p) + __attribute__warn_unused_result__ + __attribute__nonnull__(pTHX_1); +#define PERL_ARGS_ASSERT_IS_UTF8_X_L \ + assert(p) + +PERL_CALLCONV bool Perl_is_utf8_X_LV(pTHX_ const U8 *p) + __attribute__warn_unused_result__ + __attribute__nonnull__(pTHX_1); +#define PERL_ARGS_ASSERT_IS_UTF8_X_LV \ + assert(p) + +PERL_CALLCONV bool Perl_is_utf8_X_LVT(pTHX_ const U8 *p) + __attribute__warn_unused_result__ + __attribute__nonnull__(pTHX_1); +#define PERL_ARGS_ASSERT_IS_UTF8_X_LVT \ + assert(p) + +PERL_CALLCONV bool Perl_is_utf8_X_LV_LVT_V(pTHX_ const U8 *p) + __attribute__warn_unused_result__ + __attribute__nonnull__(pTHX_1); +#define PERL_ARGS_ASSERT_IS_UTF8_X_LV_LVT_V \ + assert(p) + +PERL_CALLCONV bool Perl_is_utf8_X_T(pTHX_ const U8 *p) + __attribute__warn_unused_result__ + __attribute__nonnull__(pTHX_1); +#define PERL_ARGS_ASSERT_IS_UTF8_X_T \ + assert(p) + +PERL_CALLCONV bool 
Perl_is_utf8_X_V(pTHX_ const U8 *p) + __attribute__warn_unused_result__ + __attribute__nonnull__(pTHX_1); +#define PERL_ARGS_ASSERT_IS_UTF8_X_V \ + assert(p) + PERL_CALLCONV OP* Perl_jmaybe(pTHX_ OP *o) __attribute__nonnull__(pTHX_1); #define PERL_ARGS_ASSERT_JMAYBE \ diff --git a/regexec.c b/regexec.c index 06fe13a..b01a99b 100644 --- a/regexec.c +++ b/regexec.c @@ -120,11 +120,31 @@ /* these are unrolled below in the CCC_TRY_XXX defined */ #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \ if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END + +/* Doesn't do an assert to verify that is correct */ +#define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \ + if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)" "); LEAVE; } } STMT_END + #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a") #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0") #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ") -#define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86") +#define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \ + LOAD_UTF8_CHARCLASS(X_begin, " "), \ + LOAD_UTF8_CHARCLASS(X_non_hangul, "A"), \ + /* These are utf8 constants, and not utf-ebcdic constants, so the \ + * assert should likely and hopefully fail on an EBCDIC machine */ \ + LOAD_UTF8_CHARCLASS(X_extend, "\xcc\x80"), /* U+0300 */ \ + \ + /* No asserts are done for these, in case called on an early \ + * Unicode version in which they map to nothing */ \ + LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend),/* U+0E40 "\xe0\xb9\x80" */ \ + LOAD_UTF8_CHARCLASS_NO_CHECK(X_L), /* U+1100 "\xe1\x84\x80" */ \ + LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV), /* U+AC00 "\xea\xb0\x80" */ \ + LOAD_UTF8_CHARCLASS_NO_CHECK(X_LVT), /* U+AC01 "\xea\xb0\x81" */ \ + LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V),/* U+AC01 "\xea\xb0\x81" */\ + 
LOAD_UTF8_CHARCLASS_NO_CHECK(X_T), /* U+11A8 "\xe1\x86\xa8" */ \ + LOAD_UTF8_CHARCLASS_NO_CHECK(X_V) /* U+1160 "\xe1\x85\xa0" */ /* We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test @@ -3521,22 +3541,216 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) CCC_TRY_AFF( DIGIT, DIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC); CCC_TRY_NEG(NDIGIT, NDIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC); - case CLUMP: + case CLUMP: /* Match \X: logical Unicode character. This is defined as + a Unicode extended Grapheme Cluster */ + /* From http://www.unicode.org/reports/tr29 (5.2 version). An + extended Grapheme Cluster is: + + CR LF + | Prepend* Begin Extend* + | . + + Begin is (Hangul-syllable | ! Control) + Extend is (Grapheme_Extend | Spacing_Mark) + Control is [ GCB_Control CR LF ] + + The discussion below shows how the code for CLUMP is derived + from this regex. Note that most of these concepts are from + property values of the Grapheme Cluster Boundary (GCB) property. + No code point can have multiple property values for a given + property. Thus a code point in Prepend can't be in Control, but + it must be in !Control. This is why Control above includes + GCB_Control plus CR plus LF. The latter two are used in the GCB + property separately, and so can't be in GCB_Control, even though + they logically are controls. Control is not the same as gc=cc, + but includes format and other characters as well. 
+ + The Unicode definition of Hangul-syllable is: + ( L+ + | (L* ( ( V | LV ) V* | LVT ) T*) + | T+ + ) + Each of these is a value for the GCB property, and hence must be + disjoint, so the order they are tested is immaterial, so the + above can safely be changed to + T+ + | L+ + | (L* ( LVT | ( V | LV ) V*) T*) + + The last two terms can be combined like this: + L* ( L + | (( LVT | ( V | LV ) V*) T*)) + + And refactored into this: + L* (L | LVT T* | V V* T* | LV V* T*) + + That means that if we have seen any L's at all we can quit + there, but if the next character is a LVT, a V or an LV we + should keep going. + + There is a subtlety with Prepend* which showed up in testing. + Note that the Begin, and only the Begin is required in: + | Prepend* Begin Extend* + Also, Begin contains '! Control'. A Prepend must be a '! + Control', which means it must be a Begin. What it comes down to + is that if we match Prepend* and then find no suitable Begin + afterwards, then if we backtrack the last Prepend, that one will + be a suitable Begin. + */ + if (locinput >= PL_regeol) sayNO; - if (do_utf8) { - LOAD_UTF8_CHARCLASS_MARK(); - if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8)) - sayNO; - locinput += PL_utf8skip[nextchr]; - while (locinput < PL_regeol && - swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8)) - locinput += UTF8SKIP(locinput); - if (locinput > PL_regeol) - sayNO; - } - else - locinput++; + if (! do_utf8) { + + /* Match either CR LF or '.', as all the other possibilities + * require utf8 */ + locinput++; /* Match the . or CR */ + if (nextchr == '\r' + && locinput < PL_regeol + && UCHARAT(locinput) == '\n') locinput++; + } + else { + + /* Utf8: See if is ( CR LF ); already know that locinput < + * PL_regeol, so locinput+1 is in bounds */ + if (nextchr == '\r' && UCHARAT(locinput + 1) == '\n') { + locinput += 2; + } + else { + /* In case have to backtrack to beginning, then match '.' 
*/ + char *starting = locinput; + + /* In case have to backtrack the last prepend */ + char *previous_prepend = 0; + + LOAD_UTF8_CHARCLASS_GCB(); + + /* Match (prepend)* */ + while (locinput < PL_regeol + && swash_fetch(PL_utf8_X_prepend, + (U8*)locinput, do_utf8)) + { + previous_prepend = locinput; + locinput += UTF8SKIP(locinput); + } + + /* As noted above, if we matched a prepend character, but + * the next thing won't match, back off the last prepend we + * matched, as it is guaranteed to match the begin */ + if (previous_prepend + && (locinput >= PL_regeol + || ! swash_fetch(PL_utf8_X_begin, + (U8*)locinput, do_utf8))) + { + locinput = previous_prepend; + } + + /* Note that here we know PL_regeol > locinput, as we + * tested that upon input to this switch case, and if we + * moved locinput forward, we tested the result just above + * and it either passed, or we backed off so that it will + * now pass */ + if (! swash_fetch(PL_utf8_X_begin, (U8*)locinput, do_utf8)) { + + /* Here did not match the required 'Begin' in the + * second term. So just match the very first + * character, the '.' of the final term of the regex */ + locinput = starting + UTF8SKIP(starting); + } else { + + /* Here is the beginning of a character that can have + * an extender. It is either a hangul syllable, or a + * non-control */ + if (swash_fetch(PL_utf8_X_non_hangul, + (U8*)locinput, do_utf8)) + { + + /* Here not a Hangul syllable, must be a + * ('! * Control') */ + locinput += UTF8SKIP(locinput); + } else { + + /* Here is a Hangul syllable. It can be composed + * of several individual characters. One + * possibility is T+ */ + if (swash_fetch(PL_utf8_X_T, + (U8*)locinput, do_utf8)) + { + while (locinput < PL_regeol + && swash_fetch(PL_utf8_X_T, + (U8*)locinput, do_utf8)) + { + locinput += UTF8SKIP(locinput); + } + } else { + + /* Here, not T+, but is a Hangul. 
That means + * it is one of the others: L, LV, LVT or V, + * and matches: + * L* (L | LVT T* | V V* T* | LV V* T*) */ + + /* Match L* */ + while (locinput < PL_regeol + && swash_fetch(PL_utf8_X_L, + (U8*)locinput, do_utf8)) + { + locinput += UTF8SKIP(locinput); + } + + /* Here, have exhausted L*. If the next + * character is not an LV, LVT nor V, it means + * we had to have at least one L, so matches L+ + * in the original equation, we have a complete + * hangul syllable. Are done. */ + + if (locinput < PL_regeol + && swash_fetch(PL_utf8_X_LV_LVT_V, + (U8*)locinput, do_utf8)) + { + + /* Otherwise keep going. Must be LV, LVT + * or V. See if LVT */ + if (swash_fetch(PL_utf8_X_LVT, + (U8*)locinput, do_utf8)) + { + locinput += UTF8SKIP(locinput); + } else { + + /* Must be V or LV. Take it, then + * match V* */ + locinput += UTF8SKIP(locinput); + while (locinput < PL_regeol + && swash_fetch(PL_utf8_X_V, + (U8*)locinput, do_utf8)) + { + locinput += UTF8SKIP(locinput); + } + } + + /* And any of LV, LVT, or V can be followed + * by T* */ + while (locinput < PL_regeol + && swash_fetch(PL_utf8_X_T, + (U8*)locinput, + do_utf8)) + { + locinput += UTF8SKIP(locinput); + } + } + } + } + + /* Match any extender */ + while (locinput < PL_regeol + && swash_fetch(PL_utf8_X_extend, + (U8*)locinput, do_utf8)) + { + locinput += UTF8SKIP(locinput); + } + } + } + if (locinput > PL_regeol) sayNO; + } nextchr = UCHARAT(locinput); break; diff --git a/sv.c b/sv.c index 95ad106..ecd4866 100644 --- a/sv.c +++ b/sv.c @@ -12187,6 +12187,16 @@ perl_clone_using(PerlInterpreter *proto_perl, UV flags, PL_utf8_punct = sv_dup_inc(proto_perl->Iutf8_punct, param); PL_utf8_xdigit = sv_dup_inc(proto_perl->Iutf8_xdigit, param); PL_utf8_mark = sv_dup_inc(proto_perl->Iutf8_mark, param); + PL_utf8_X_begin = sv_dup_inc(proto_perl->Iutf8_X_begin, param); + PL_utf8_X_extend = sv_dup_inc(proto_perl->Iutf8_X_extend, param); + PL_utf8_X_prepend = sv_dup_inc(proto_perl->Iutf8_X_prepend, param); + 
PL_utf8_X_non_hangul = sv_dup_inc(proto_perl->Iutf8_X_non_hangul, param); + PL_utf8_X_L = sv_dup_inc(proto_perl->Iutf8_X_L, param); + PL_utf8_X_LV = sv_dup_inc(proto_perl->Iutf8_X_LV, param); + PL_utf8_X_LVT = sv_dup_inc(proto_perl->Iutf8_X_LVT, param); + PL_utf8_X_T = sv_dup_inc(proto_perl->Iutf8_X_T, param); + PL_utf8_X_V = sv_dup_inc(proto_perl->Iutf8_X_V, param); + PL_utf8_X_LV_LVT_V = sv_dup_inc(proto_perl->Iutf8_X_LV_LVT_V, param); PL_utf8_toupper = sv_dup_inc(proto_perl->Iutf8_toupper, param); PL_utf8_totitle = sv_dup_inc(proto_perl->Iutf8_totitle, param); PL_utf8_tolower = sv_dup_inc(proto_perl->Iutf8_tolower, param); diff --git a/utf8.c b/utf8.c index c504891..5f3c990 100644 --- a/utf8.c +++ b/utf8.c @@ -1488,6 +1488,106 @@ Perl_is_utf8_mark(pTHX_ const U8 *p) return is_utf8_common(p, &PL_utf8_mark, "IsM"); } +bool +Perl_is_utf8_X_begin(pTHX_ const U8 *p) +{ + dVAR; + + PERL_ARGS_ASSERT_IS_UTF8_X_BEGIN; + + return is_utf8_common(p, &PL_utf8_X_begin, "_X_Begin"); +} + +bool +Perl_is_utf8_X_extend(pTHX_ const U8 *p) +{ + dVAR; + + PERL_ARGS_ASSERT_IS_UTF8_X_EXTEND; + + return is_utf8_common(p, &PL_utf8_X_extend, "_X_Extend"); +} + +bool +Perl_is_utf8_X_prepend(pTHX_ const U8 *p) +{ + dVAR; + + PERL_ARGS_ASSERT_IS_UTF8_X_PREPEND; + + return is_utf8_common(p, &PL_utf8_X_prepend, "GCB=Prepend"); +} + +bool +Perl_is_utf8_X_non_hangul(pTHX_ const U8 *p) +{ + dVAR; + + PERL_ARGS_ASSERT_IS_UTF8_X_NON_HANGUL; + + return is_utf8_common(p, &PL_utf8_X_non_hangul, "HST=Not_Applicable"); +} + +bool +Perl_is_utf8_X_L(pTHX_ const U8 *p) +{ + dVAR; + + PERL_ARGS_ASSERT_IS_UTF8_X_L; + + return is_utf8_common(p, &PL_utf8_X_L, "GCB=L"); +} + +bool +Perl_is_utf8_X_LV(pTHX_ const U8 *p) +{ + dVAR; + + PERL_ARGS_ASSERT_IS_UTF8_X_LV; + + return is_utf8_common(p, &PL_utf8_X_LV, "GCB=LV"); +} + +bool +Perl_is_utf8_X_LVT(pTHX_ const U8 *p) +{ + dVAR; + + PERL_ARGS_ASSERT_IS_UTF8_X_LVT; + + return is_utf8_common(p, &PL_utf8_X_LVT, "GCB=LVT"); +} + +bool +Perl_is_utf8_X_T(pTHX_ const 
U8 *p) +{ + dVAR; + + PERL_ARGS_ASSERT_IS_UTF8_X_T; + + return is_utf8_common(p, &PL_utf8_X_T, "GCB=T"); +} + +bool +Perl_is_utf8_X_V(pTHX_ const U8 *p) +{ + dVAR; + + PERL_ARGS_ASSERT_IS_UTF8_X_V; + + return is_utf8_common(p, &PL_utf8_X_V, "GCB=V"); +} + +bool +Perl_is_utf8_X_LV_LVT_V(pTHX_ const U8 *p) +{ + dVAR; + + PERL_ARGS_ASSERT_IS_UTF8_X_LV_LVT_V; + + return is_utf8_common(p, &PL_utf8_X_LV_LVT_V, "_X_LV_LVT_V"); +} + /* =for apidoc to_utf8_case @@ -1532,6 +1632,22 @@ Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, if (!*swashp) /* load on-demand */ *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0); + /* This is the beginnings of a skeleton of code to read the info section + * that is in all the swashes in case we ever want to do that, so one can + * read things whose maps aren't code points, and whose default if missing + * is not to the code point itself. This was just to see if it actually + * worked. Details on what the possibilities are are in perluniprops.pod + HV * const hv = get_hv("utf8::SwashInfo", 0); + if (hv) { + SV **svp; + svp = hv_fetch(hv, (const char*)normal, strlen(normal), FALSE); + const char *s; + + HV * const this_hash = SvRV(*svp); + svp = hv_fetch(this_hash, "type", strlen("type"), FALSE); + s = SvPV_const(*svp, len); + } + }*/ /* The 0xDF is the only special casing Unicode code point below 0x100. */ if (special && (uv1 == 0xDF || uv1 > 0xFF)) { @@ -1594,7 +1710,8 @@ Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, } } - if (!len) /* Neither: just copy. */ + if (!len) /* Neither: just copy. 
In other words, there was no mapping + defined, which means that the code point maps to itself */ len = uvchr_to_utf8(ustrp, uv0) - ustrp; if (lenp) @@ -1809,7 +1926,7 @@ Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8) ptr = tmputf8; } /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ - * then the "swatch" is a vec() for al the chars which start + * then the "swatch" is a vec() for all the chars which start * with 0xAA..0xYY * So the key in the hash (klen) is length of encoded char -1 */ @@ -1817,7 +1934,7 @@ Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8) off = ptr[klen]; if (klen == 0) { - /* If char in invariant then swatch is for all the invariant chars + /* If char is invariant then swatch is for all the invariant chars * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK */ needents = UTF_CONTINUATION_MARK; diff --git a/utf8.h b/utf8.h index 19f2174..8fef274 100644 --- a/utf8.h +++ b/utf8.h @@ -73,21 +73,20 @@ END_EXTERN_C U+0000..U+007F 00..7F U+0080..U+07FF C2..DF 80..BF - U+0800..U+0FFF E0 A0..BF 80..BF + U+0800..U+0FFF E0 * A0..BF 80..BF U+1000..U+CFFF E1..EC 80..BF 80..BF - U+D000..U+D7FF ED 80..9F 80..BF - U+D800..U+DFFF ******* ill-formed ******* + U+D000..U+D7FF ED * 80..9F 80..BF + U+D800..U+DFFF +++++++ utf16 surrogates, not legal utf8 +++++++ U+E000..U+FFFF EE..EF 80..BF 80..BF - U+10000..U+3FFFF F0 90..BF 80..BF 80..BF + U+10000..U+3FFFF F0 * 90..BF 80..BF 80..BF U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF U+100000..U+10FFFF F4 80..8F 80..BF 80..BF -Note the A0..BF in U+0800..U+0FFF, the 80..9F in U+D000...U+D7FF, -the 90..BF in U+10000..U+3FFFF, and the 80...8F in U+100000..U+10FFFF. -The "gaps" are caused by legal UTF-8 avoiding non-shortest encodings: -it is technically possible to UTF-8-encode a single code point in different -ways, but that is explicitly forbidden, and the shortest possible encoding -should always be used (and that is what Perl does). 
+Note the gaps before the 2nd Byte entries above marked by '*'. These are +caused by legal UTF-8 avoiding non-shortest encodings: it is technically +possible to UTF-8-encode a single code point in different ways, but that is +explicitly forbidden, and the shortest possible encoding should always be used +(and that is what Perl does). */ @@ -213,11 +212,12 @@ encoded character. #define UTF8_ALLOW_EMPTY 0x0001 #define UTF8_ALLOW_CONTINUATION 0x0002 #define UTF8_ALLOW_NON_CONTINUATION 0x0004 -#define UTF8_ALLOW_FE_FF 0x0008 /* Allow above 0x7fffFFFF */ -#define UTF8_ALLOW_SHORT 0x0010 +#define UTF8_ALLOW_FE_FF 0x0008 /* Allow FE or FF start bytes, \ + yields above 0x7fffFFFF */ +#define UTF8_ALLOW_SHORT 0x0010 /* expecting more bytes */ #define UTF8_ALLOW_SURROGATE 0x0020 #define UTF8_ALLOW_FFFF 0x0040 /* Allow UNICODE_ILLEGAL */ -#define UTF8_ALLOW_LONG 0x0080 +#define UTF8_ALLOW_LONG 0x0080 /* expecting fewer bytes */ #define UTF8_ALLOW_ANYUV (UTF8_ALLOW_EMPTY|UTF8_ALLOW_FE_FF|\ UTF8_ALLOW_SURROGATE|UTF8_ALLOW_FFFF) #define UTF8_ALLOW_ANY 0x00FF