Merge commit 'khwilliamson/x' into blead

diff --git a/MANIFEST b/MANIFEST

index 43e9a49..cb599fd 100644 (file)
--- a/MANIFEST
+++ b/MANIFEST
@@ -3593,6 +3593,7 @@ lib/Unicode/README                Explanation what happened to lib/unicode.
 lib/Unicode/UCD.pm             Unicode character database
 lib/Unicode/UCD.t              See if Unicode character database works
 lib/unicore/ArabicShaping.txt                  Unicode character database
+lib/unicore/auxiliary/GCBTest.txt      Unicode character database
 lib/unicore/auxiliary/GraphemeBreakProperty.txt        Unicode character database
 lib/unicore/auxiliary/SentenceBreakProperty.txt        Unicode character database
 lib/unicore/auxiliary/WordBreakProperty.txt    Unicode character database
diff --git a/embed.fnc b/embed.fnc

index 575c4df..6b24d1e 100644 (file)
--- a/embed.fnc
+++ b/embed.fnc
@@ -502,6 +502,16 @@ ApR        |bool   |is_utf8_print  |NN const U8 *p
 ApR    |bool   |is_utf8_punct  |NN const U8 *p
 ApR    |bool   |is_utf8_xdigit |NN const U8 *p
 ApR    |bool   |is_utf8_mark   |NN const U8 *p
+pR     |bool   |is_utf8_X_begin        |NN const U8 *p
+pR     |bool   |is_utf8_X_extend       |NN const U8 *p
+pR     |bool   |is_utf8_X_prepend      |NN const U8 *p
+pR     |bool   |is_utf8_X_non_hangul   |NN const U8 *p
+pR     |bool   |is_utf8_X_L            |NN const U8 *p
+pR     |bool   |is_utf8_X_LV           |NN const U8 *p
+pR     |bool   |is_utf8_X_LVT          |NN const U8 *p
+pR     |bool   |is_utf8_X_LV_LVT_V     |NN const U8 *p
+pR     |bool   |is_utf8_X_T            |NN const U8 *p
+pR     |bool   |is_utf8_X_V            |NN const U8 *p
 : Used in perly.y
 p      |OP*    |jmaybe         |NN OP *o
 : Used in pp.c 
diff --git a/embed.h b/embed.h

index 2af070d..e30e6f2 100644 (file)
--- a/embed.h
+++ b/embed.h
@@ -392,6 +392,16 @@
 #define is_utf8_xdigit         Perl_is_utf8_xdigit
 #define is_utf8_mark           Perl_is_utf8_mark
 #ifdef PERL_CORE
+#define is_utf8_X_begin                Perl_is_utf8_X_begin
+#define is_utf8_X_extend       Perl_is_utf8_X_extend
+#define is_utf8_X_prepend      Perl_is_utf8_X_prepend
+#define is_utf8_X_non_hangul   Perl_is_utf8_X_non_hangul
+#define is_utf8_X_L            Perl_is_utf8_X_L
+#define is_utf8_X_LV           Perl_is_utf8_X_LV
+#define is_utf8_X_LVT          Perl_is_utf8_X_LVT
+#define is_utf8_X_LV_LVT_V     Perl_is_utf8_X_LV_LVT_V
+#define is_utf8_X_T            Perl_is_utf8_X_T
+#define is_utf8_X_V            Perl_is_utf8_X_V
 #define jmaybe                 Perl_jmaybe
 #define keyword                        Perl_keyword
 #endif
@@ -2785,6 +2795,16 @@
 #define is_utf8_xdigit(a)      Perl_is_utf8_xdigit(aTHX_ a)
 #define is_utf8_mark(a)                Perl_is_utf8_mark(aTHX_ a)
 #ifdef PERL_CORE
+#define is_utf8_X_begin(a)     Perl_is_utf8_X_begin(aTHX_ a)
+#define is_utf8_X_extend(a)    Perl_is_utf8_X_extend(aTHX_ a)
+#define is_utf8_X_prepend(a)   Perl_is_utf8_X_prepend(aTHX_ a)
+#define is_utf8_X_non_hangul(a)        Perl_is_utf8_X_non_hangul(aTHX_ a)
+#define is_utf8_X_L(a)         Perl_is_utf8_X_L(aTHX_ a)
+#define is_utf8_X_LV(a)                Perl_is_utf8_X_LV(aTHX_ a)
+#define is_utf8_X_LVT(a)       Perl_is_utf8_X_LVT(aTHX_ a)
+#define is_utf8_X_LV_LVT_V(a)  Perl_is_utf8_X_LV_LVT_V(aTHX_ a)
+#define is_utf8_X_T(a)         Perl_is_utf8_X_T(aTHX_ a)
+#define is_utf8_X_V(a)         Perl_is_utf8_X_V(aTHX_ a)
 #define jmaybe(a)              Perl_jmaybe(aTHX_ a)
 #define keyword(a,b,c)         Perl_keyword(aTHX_ a,b,c)
 #endif
diff --git a/embedvar.h b/embedvar.h

index e805a79..63ed46e 100644 (file)
--- a/embedvar.h
+++ b/embedvar.h
@@ -324,6 +324,16 @@
 #define PL_unitcheckav_save    (vTHX->Iunitcheckav_save)
 #define PL_unlockhook          (vTHX->Iunlockhook)
 #define PL_unsafe              (vTHX->Iunsafe)
+#define PL_utf8_X_L            (vTHX->Iutf8_X_L)
+#define PL_utf8_X_LV           (vTHX->Iutf8_X_LV)
+#define PL_utf8_X_LVT          (vTHX->Iutf8_X_LVT)
+#define PL_utf8_X_LV_LVT_V     (vTHX->Iutf8_X_LV_LVT_V)
+#define PL_utf8_X_T            (vTHX->Iutf8_X_T)
+#define PL_utf8_X_V            (vTHX->Iutf8_X_V)
+#define PL_utf8_X_begin                (vTHX->Iutf8_X_begin)
+#define PL_utf8_X_extend       (vTHX->Iutf8_X_extend)
+#define PL_utf8_X_non_hangul   (vTHX->Iutf8_X_non_hangul)
+#define PL_utf8_X_prepend      (vTHX->Iutf8_X_prepend)
 #define PL_utf8_alnum          (vTHX->Iutf8_alnum)
 #define PL_utf8_alpha          (vTHX->Iutf8_alpha)
 #define PL_utf8_ascii          (vTHX->Iutf8_ascii)
@@ -641,6 +651,16 @@
 #define PL_Iunitcheckav_save   PL_unitcheckav_save
 #define PL_Iunlockhook         PL_unlockhook
 #define PL_Iunsafe             PL_unsafe
+#define PL_Iutf8_X_L           PL_utf8_X_L
+#define PL_Iutf8_X_LV          PL_utf8_X_LV
+#define PL_Iutf8_X_LVT         PL_utf8_X_LVT
+#define PL_Iutf8_X_LV_LVT_V    PL_utf8_X_LV_LVT_V
+#define PL_Iutf8_X_T           PL_utf8_X_T
+#define PL_Iutf8_X_V           PL_utf8_X_V
+#define PL_Iutf8_X_begin       PL_utf8_X_begin
+#define PL_Iutf8_X_extend      PL_utf8_X_extend
+#define PL_Iutf8_X_non_hangul  PL_utf8_X_non_hangul
+#define PL_Iutf8_X_prepend     PL_utf8_X_prepend
 #define PL_Iutf8_alnum         PL_utf8_alnum
 #define PL_Iutf8_alpha         PL_utf8_alpha
 #define PL_Iutf8_ascii         PL_utf8_ascii
diff --git a/intrpvar.h b/intrpvar.h

index 650eb62..8fe641c 100644 (file)
--- a/intrpvar.h
+++ b/intrpvar.h
@@ -531,6 +531,16 @@ PERLVAR(Iutf8_print,       SV *)
 PERLVAR(Iutf8_punct,   SV *)
 PERLVAR(Iutf8_xdigit,  SV *)
 PERLVAR(Iutf8_mark,    SV *)
+PERLVAR(Iutf8_X_begin, SV *)
+PERLVAR(Iutf8_X_extend,        SV *)
+PERLVAR(Iutf8_X_prepend,       SV *)
+PERLVAR(Iutf8_X_non_hangul,    SV *)
+PERLVAR(Iutf8_X_L,     SV *)
+PERLVAR(Iutf8_X_LV,    SV *)
+PERLVAR(Iutf8_X_LVT,   SV *)
+PERLVAR(Iutf8_X_T,     SV *)
+PERLVAR(Iutf8_X_V,     SV *)
+PERLVAR(Iutf8_X_LV_LVT_V,      SV *)
 PERLVAR(Iutf8_toupper, SV *)
 PERLVAR(Iutf8_totitle, SV *)
 PERLVAR(Iutf8_tolower, SV *)
diff --git a/lib/unicore/README.perl b/lib/unicore/README.perl

index 7515825..59d66a8 100644 (file)
--- a/lib/unicore/README.perl
+++ b/lib/unicore/README.perl
@@ -5,16 +5,17 @@ The *.txt files were copied from
 with subdirectories 'extracted' and 'auxiliary'
 
 The Unihan files were not included due to space considerations.  Also NOT
-included were any *.html files and *Test.txt files.  It is possible to add the
-Unihan files, and edit mktables (see instructions near its beginning) to look
-at them.
+included were any *.html files.  It is possible to add the Unihan files, and
+edit mktables (see instructions near its beginning) to look at them.
 
 The file 'version' should exist and be a single line with the Unicode version,
 like:
 5.2.0
 
 To be 8.3 filesystem friendly, the names of some of the input files have been
-changed from the values that are in the Unicode DB:
+changed from the values that are in the Unicode DB.  Not all of the Test files
+are currently used, so some may not be present and the corresponding mv's can
+fail.  The .html Test files are not touched.
 
 mv PropertyValueAliases.txt PropValueAliases.txt
 mv NamedSequencesProv.txt NamedSqProv.txt
@@ -33,6 +34,11 @@ mv extracted/DerivedLineBreak.txt extracted/DLineBreak.txt
 mv extracted/DerivedNumericType.txt extracted/DNumType.txt
 mv extracted/DerivedNumericValues.txt extracted/DNumValues.txt
 
+mv auxiliary/GraphemeBreakTest.txt auxiliary/GCBTest.txt
+mv auxiliary/LineBreakTest.txt auxiliary/LBTest.txt
+mv auxiliary/SentenceBreakTest.txt auxiliary/SBTest.txt
+mv auxiliary/WordBreakTest.txt auxiliary/WBTest.txt
+
 If you have the Unihan database (5.2 and above), you should also do the
 following:
 
@@ -45,9 +51,9 @@ mv Unihan_RadicalStrokeCounts.txt UnihanRadicalStrokeCounts.txt
 mv Unihan_Readings.txt UnihanReadings.txt
 mv Unihan_Variants.txt UnihanVariants.txt
 
-If you download everything, the names of files, such as test files, that are
-not used by mktables are not changed by the above, and will not work correctly
-as-is on 8.3 filesystems.
+If you download everything, the names of files that are not used by mktables
+are not changed by the above, and will not work correctly as-is on 8.3
+filesystems.
 
 mktables is used to generate the tables used by the rest of Perl.  It will warn
 you about any *.txt files in the directory substructure that it doesn't know
@@ -58,17 +64,12 @@ its lists to process.  You can run
 
 to have it try to process these tables generically.
 
-If any files are added, deleted, or their names change, you must run
-
-    mktables -makelist
-
-to generate a new list of all the files.
-
 FOR PUMPKINS
 
 The files are inter-related.  If you take the latest UnicodeData.txt, for
 example, but leave the older versions of other files, there can be subtle
-problems.
+problems.  So get everything available from Unicode, and delete those which
+aren't needed.
 
 When moving to a new version of Unicode, you need to update 'version' by hand
 
@@ -85,27 +86,19 @@ mktables can continue to be used for earlier Unicode versions.
 When putting out a new Perl release, think about if any of the Deprecated
 properties should be moved to Suppressed.
 
-The *.pl files are generated from the *.txt files by the mktables script,
-more recently done during the Perl build process, but if you want to try
-the old manual way:
-       
-       cd lib/unicore
-       p4 edit *.pl */*.pl */*/*.pl
-       perl ./mktables -P ../../pod -T ../../t/re/uniprops.t -makelist
-       p4 revert -a
-       cd ../..
-       perl Porting/manicheck
-       
-If any new (or deleted, unlikely but not impossible) *.pl files are indicated:
-
-       cd lib/unicore
-       p4 add ...
-       p4 delete ...
-       cd ../...
-       p4 edit MANIFEST
-       ...
+The code in regexec.c for the \X match construct is intimately tied to the
+regular expression in UAX #29 (http://www.unicode.org/reports/tr29/).  You
+should check whether it has changed, and if so, regexec.c should be modified.
+The current one is:
+( CRLF
+| Prepend* ( Hangul-syllable | !Control )
+  ( Grapheme_Extend | Spacing_Mark )*
+| . )
+
+mktables has many checks to warn you if there are unexpected or novel things
+that it doesn't know how to handle.
 
-And finally:
+Finally:
 
        p4 submit
 
diff --git a/lib/unicore/auxiliary/GCBTest.txt b/lib/unicore/auxiliary/GCBTest.txt

new file mode 100644 (file)

index 0000000..7932e4d
--- /dev/null
+++ b/lib/unicore/auxiliary/GCBTest.txt
@@ -0,0 +1,311 @@
+# GraphemeBreakTest-5.2.0.txt
+# Date: 2009-09-19, 00:42:12 GMT [MD]
+#
+# Unicode Character Database
+# Copyright (c) 1991-2009 Unicode, Inc.
+# For terms of use, see http://www.unicode.org/terms_of_use.html
+# For documentation, see http://www.unicode.org/reports/tr44/
+#
+# Default Grapheme Break Test
+#
+# Format:
+# <string> (# <comment>)? 
+#  <string> contains hex Unicode code points, with 
+#      ÷ wherever there is a break opportunity, and 
+#      × wherever there is not.
+#  <comment> the format can change, but currently it shows:
+#      - the sample character name
+#      - (x) the Grapheme_Break property* for the sample character
+#      - [x] the rule that determines whether there is a break or not
+#
+# These samples may be extended or changed in the future.
+#
+÷ 0020 ÷ 0020 ÷     #  ÷ [0.2] SPACE (Other) ÷ [999.0] SPACE (Other) ÷ [0.3]
+÷ 0020 × 0308 ÷ 0020 ÷     #  ÷ [0.2] SPACE (Other) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] SPACE (Other) ÷ [0.3]
+÷ 0020 ÷ 000D ÷     #  ÷ [0.2] SPACE (Other) ÷ [5.0] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
+÷ 0020 × 0308 ÷ 000D ÷     #  ÷ [0.2] SPACE (Other) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
+÷ 0020 ÷ 000A ÷     #  ÷ [0.2] SPACE (Other) ÷ [5.0] <LINE FEED (LF)> (LF) ÷ [0.3]
+÷ 0020 × 0308 ÷ 000A ÷     #  ÷ [0.2] SPACE (Other) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <LINE FEED (LF)> (LF) ÷ [0.3]
+÷ 0020 ÷ 0001 ÷     #  ÷ [0.2] SPACE (Other) ÷ [5.0] <START OF HEADING> (Control) ÷ [0.3]
+÷ 0020 × 0308 ÷ 0001 ÷     #  ÷ [0.2] SPACE (Other) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <START OF HEADING> (Control) ÷ [0.3]
+÷ 0020 × 0300 ÷     #  ÷ [0.2] SPACE (Other) × [9.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3]
+÷ 0020 × 0308 × 0300 ÷     #  ÷ [0.2] SPACE (Other) × [9.0] COMBINING DIAERESIS (Extend) × [9.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3]
+÷ 0020 ÷ 0E40 ÷     #  ÷ [0.2] SPACE (Other) ÷ [999.0] THAI CHARACTER SARA E (Prepend) ÷ [0.3]
+÷ 0020 × 0308 ÷ 0E40 ÷     #  ÷ [0.2] SPACE (Other) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] THAI CHARACTER SARA E (Prepend) ÷ [0.3]
+÷ 0020 × 0903 ÷     #  ÷ [0.2] SPACE (Other) × [9.1] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3]
+÷ 0020 × 0308 × 0903 ÷     #  ÷ [0.2] SPACE (Other) × [9.0] COMBINING DIAERESIS (Extend) × [9.1] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3]
+÷ 0020 ÷ 1100 ÷     #  ÷ [0.2] SPACE (Other) ÷ [999.0] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3]
+÷ 0020 × 0308 ÷ 1100 ÷     #  ÷ [0.2] SPACE (Other) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3]
+÷ 0020 ÷ 1160 ÷     #  ÷ [0.2] SPACE (Other) ÷ [999.0] HANGUL JUNGSEONG FILLER (V) ÷ [0.3]
+÷ 0020 × 0308 ÷ 1160 ÷     #  ÷ [0.2] SPACE (Other) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JUNGSEONG FILLER (V) ÷ [0.3]
+÷ 0020 ÷ 11A8 ÷     #  ÷ [0.2] SPACE (Other) ÷ [999.0] HANGUL JONGSEONG KIYEOK (T) ÷ [0.3]
+÷ 0020 × 0308 ÷ 11A8 ÷     #  ÷ [0.2] SPACE (Other) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JONGSEONG KIYEOK (T) ÷ [0.3]
+÷ 0020 ÷ AC00 ÷     #  ÷ [0.2] SPACE (Other) ÷ [999.0] HANGUL SYLLABLE GA (LV) ÷ [0.3]
+÷ 0020 × 0308 ÷ AC00 ÷     #  ÷ [0.2] SPACE (Other) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GA (LV) ÷ [0.3]
+÷ 0020 ÷ AC01 ÷     #  ÷ [0.2] SPACE (Other) ÷ [999.0] HANGUL SYLLABLE GAG (LVT) ÷ [0.3]
+÷ 0020 × 0308 ÷ AC01 ÷     #  ÷ [0.2] SPACE (Other) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GAG (LVT) ÷ [0.3]
+÷ 000D ÷ 0020 ÷     #  ÷ [0.2] <CARRIAGE RETURN (CR)> (CR) ÷ [4.0] SPACE (Other) ÷ [0.3]
+÷ 000D ÷ 0308 ÷ 0020 ÷     #  ÷ [0.2] <CARRIAGE RETURN (CR)> (CR) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [999.0] SPACE (Other) ÷ [0.3]
+÷ 000D ÷ 000D ÷     #  ÷ [0.2] <CARRIAGE RETURN (CR)> (CR) ÷ [4.0] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
+÷ 000D ÷ 0308 ÷ 000D ÷     #  ÷ [0.2] <CARRIAGE RETURN (CR)> (CR) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
+÷ 000D × 000A ÷     #  ÷ [0.2] <CARRIAGE RETURN (CR)> (CR) × [3.0] <LINE FEED (LF)> (LF) ÷ [0.3]
+÷ 000D ÷ 0308 ÷ 000A ÷     #  ÷ [0.2] <CARRIAGE RETURN (CR)> (CR) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <LINE FEED (LF)> (LF) ÷ [0.3]
+÷ 000D ÷ 0001 ÷     #  ÷ [0.2] <CARRIAGE RETURN (CR)> (CR) ÷ [4.0] <START OF HEADING> (Control) ÷ [0.3]
+÷ 000D ÷ 0308 ÷ 0001 ÷     #  ÷ [0.2] <CARRIAGE RETURN (CR)> (CR) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <START OF HEADING> (Control) ÷ [0.3]
+÷ 000D ÷ 0300 ÷     #  ÷ [0.2] <CARRIAGE RETURN (CR)> (CR) ÷ [4.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3]
+÷ 000D ÷ 0308 × 0300 ÷     #  ÷ [0.2] <CARRIAGE RETURN (CR)> (CR) ÷ [4.0] COMBINING DIAERESIS (Extend) × [9.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3]
+÷ 000D ÷ 0E40 ÷     #  ÷ [0.2] <CARRIAGE RETURN (CR)> (CR) ÷ [4.0] THAI CHARACTER SARA E (Prepend) ÷ [0.3]
+÷ 000D ÷ 0308 ÷ 0E40 ÷     #  ÷ [0.2] <CARRIAGE RETURN (CR)> (CR) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [999.0] THAI CHARACTER SARA E (Prepend) ÷ [0.3]
+÷ 000D ÷ 0903 ÷     #  ÷ [0.2] <CARRIAGE RETURN (CR)> (CR) ÷ [4.0] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3]
+÷ 000D ÷ 0308 × 0903 ÷     #  ÷ [0.2] <CARRIAGE RETURN (CR)> (CR) ÷ [4.0] COMBINING DIAERESIS (Extend) × [9.1] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3]
+÷ 000D ÷ 1100 ÷     #  ÷ [0.2] <CARRIAGE RETURN (CR)> (CR) ÷ [4.0] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3]
+÷ 000D ÷ 0308 ÷ 1100 ÷     #  ÷ [0.2] <CARRIAGE RETURN (CR)> (CR) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3]
+÷ 000D ÷ 1160 ÷     #  ÷ [0.2] <CARRIAGE RETURN (CR)> (CR) ÷ [4.0] HANGUL JUNGSEONG FILLER (V) ÷ [0.3]
+÷ 000D ÷ 0308 ÷ 1160 ÷     #  ÷ [0.2] <CARRIAGE RETURN (CR)> (CR) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JUNGSEONG FILLER (V) ÷ [0.3]
+÷ 000D ÷ 11A8 ÷     #  ÷ [0.2] <CARRIAGE RETURN (CR)> (CR) ÷ [4.0] HANGUL JONGSEONG KIYEOK (T) ÷ [0.3]
+÷ 000D ÷ 0308 ÷ 11A8 ÷     #  ÷ [0.2] <CARRIAGE RETURN (CR)> (CR) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JONGSEONG KIYEOK (T) ÷ [0.3]
+÷ 000D ÷ AC00 ÷     #  ÷ [0.2] <CARRIAGE RETURN (CR)> (CR) ÷ [4.0] HANGUL SYLLABLE GA (LV) ÷ [0.3]
+÷ 000D ÷ 0308 ÷ AC00 ÷     #  ÷ [0.2] <CARRIAGE RETURN (CR)> (CR) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GA (LV) ÷ [0.3]
+÷ 000D ÷ AC01 ÷     #  ÷ [0.2] <CARRIAGE RETURN (CR)> (CR) ÷ [4.0] HANGUL SYLLABLE GAG (LVT) ÷ [0.3]
+÷ 000D ÷ 0308 ÷ AC01 ÷     #  ÷ [0.2] <CARRIAGE RETURN (CR)> (CR) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GAG (LVT) ÷ [0.3]
+÷ 000A ÷ 0020 ÷     #  ÷ [0.2] <LINE FEED (LF)> (LF) ÷ [4.0] SPACE (Other) ÷ [0.3]
+÷ 000A ÷ 0308 ÷ 0020 ÷     #  ÷ [0.2] <LINE FEED (LF)> (LF) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [999.0] SPACE (Other) ÷ [0.3]
+÷ 000A ÷ 000D ÷     #  ÷ [0.2] <LINE FEED (LF)> (LF) ÷ [4.0] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
+÷ 000A ÷ 0308 ÷ 000D ÷     #  ÷ [0.2] <LINE FEED (LF)> (LF) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
+÷ 000A ÷ 000A ÷     #  ÷ [0.2] <LINE FEED (LF)> (LF) ÷ [4.0] <LINE FEED (LF)> (LF) ÷ [0.3]
+÷ 000A ÷ 0308 ÷ 000A ÷     #  ÷ [0.2] <LINE FEED (LF)> (LF) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <LINE FEED (LF)> (LF) ÷ [0.3]
+÷ 000A ÷ 0001 ÷     #  ÷ [0.2] <LINE FEED (LF)> (LF) ÷ [4.0] <START OF HEADING> (Control) ÷ [0.3]
+÷ 000A ÷ 0308 ÷ 0001 ÷     #  ÷ [0.2] <LINE FEED (LF)> (LF) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <START OF HEADING> (Control) ÷ [0.3]
+÷ 000A ÷ 0300 ÷     #  ÷ [0.2] <LINE FEED (LF)> (LF) ÷ [4.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3]
+÷ 000A ÷ 0308 × 0300 ÷     #  ÷ [0.2] <LINE FEED (LF)> (LF) ÷ [4.0] COMBINING DIAERESIS (Extend) × [9.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3]
+÷ 000A ÷ 0E40 ÷     #  ÷ [0.2] <LINE FEED (LF)> (LF) ÷ [4.0] THAI CHARACTER SARA E (Prepend) ÷ [0.3]
+÷ 000A ÷ 0308 ÷ 0E40 ÷     #  ÷ [0.2] <LINE FEED (LF)> (LF) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [999.0] THAI CHARACTER SARA E (Prepend) ÷ [0.3]
+÷ 000A ÷ 0903 ÷     #  ÷ [0.2] <LINE FEED (LF)> (LF) ÷ [4.0] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3]
+÷ 000A ÷ 0308 × 0903 ÷     #  ÷ [0.2] <LINE FEED (LF)> (LF) ÷ [4.0] COMBINING DIAERESIS (Extend) × [9.1] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3]
+÷ 000A ÷ 1100 ÷     #  ÷ [0.2] <LINE FEED (LF)> (LF) ÷ [4.0] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3]
+÷ 000A ÷ 0308 ÷ 1100 ÷     #  ÷ [0.2] <LINE FEED (LF)> (LF) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3]
+÷ 000A ÷ 1160 ÷     #  ÷ [0.2] <LINE FEED (LF)> (LF) ÷ [4.0] HANGUL JUNGSEONG FILLER (V) ÷ [0.3]
+÷ 000A ÷ 0308 ÷ 1160 ÷     #  ÷ [0.2] <LINE FEED (LF)> (LF) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JUNGSEONG FILLER (V) ÷ [0.3]
+÷ 000A ÷ 11A8 ÷     #  ÷ [0.2] <LINE FEED (LF)> (LF) ÷ [4.0] HANGUL JONGSEONG KIYEOK (T) ÷ [0.3]
+÷ 000A ÷ 0308 ÷ 11A8 ÷     #  ÷ [0.2] <LINE FEED (LF)> (LF) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JONGSEONG KIYEOK (T) ÷ [0.3]
+÷ 000A ÷ AC00 ÷     #  ÷ [0.2] <LINE FEED (LF)> (LF) ÷ [4.0] HANGUL SYLLABLE GA (LV) ÷ [0.3]
+÷ 000A ÷ 0308 ÷ AC00 ÷     #  ÷ [0.2] <LINE FEED (LF)> (LF) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GA (LV) ÷ [0.3]
+÷ 000A ÷ AC01 ÷     #  ÷ [0.2] <LINE FEED (LF)> (LF) ÷ [4.0] HANGUL SYLLABLE GAG (LVT) ÷ [0.3]
+÷ 000A ÷ 0308 ÷ AC01 ÷     #  ÷ [0.2] <LINE FEED (LF)> (LF) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GAG (LVT) ÷ [0.3]
+÷ 0001 ÷ 0020 ÷     #  ÷ [0.2] <START OF HEADING> (Control) ÷ [4.0] SPACE (Other) ÷ [0.3]
+÷ 0001 ÷ 0308 ÷ 0020 ÷     #  ÷ [0.2] <START OF HEADING> (Control) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [999.0] SPACE (Other) ÷ [0.3]
+÷ 0001 ÷ 000D ÷     #  ÷ [0.2] <START OF HEADING> (Control) ÷ [4.0] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
+÷ 0001 ÷ 0308 ÷ 000D ÷     #  ÷ [0.2] <START OF HEADING> (Control) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
+÷ 0001 ÷ 000A ÷     #  ÷ [0.2] <START OF HEADING> (Control) ÷ [4.0] <LINE FEED (LF)> (LF) ÷ [0.3]
+÷ 0001 ÷ 0308 ÷ 000A ÷     #  ÷ [0.2] <START OF HEADING> (Control) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <LINE FEED (LF)> (LF) ÷ [0.3]
+÷ 0001 ÷ 0001 ÷     #  ÷ [0.2] <START OF HEADING> (Control) ÷ [4.0] <START OF HEADING> (Control) ÷ [0.3]
+÷ 0001 ÷ 0308 ÷ 0001 ÷     #  ÷ [0.2] <START OF HEADING> (Control) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <START OF HEADING> (Control) ÷ [0.3]
+÷ 0001 ÷ 0300 ÷     #  ÷ [0.2] <START OF HEADING> (Control) ÷ [4.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3]
+÷ 0001 ÷ 0308 × 0300 ÷     #  ÷ [0.2] <START OF HEADING> (Control) ÷ [4.0] COMBINING DIAERESIS (Extend) × [9.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3]
+÷ 0001 ÷ 0E40 ÷     #  ÷ [0.2] <START OF HEADING> (Control) ÷ [4.0] THAI CHARACTER SARA E (Prepend) ÷ [0.3]
+÷ 0001 ÷ 0308 ÷ 0E40 ÷     #  ÷ [0.2] <START OF HEADING> (Control) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [999.0] THAI CHARACTER SARA E (Prepend) ÷ [0.3]
+÷ 0001 ÷ 0903 ÷     #  ÷ [0.2] <START OF HEADING> (Control) ÷ [4.0] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3]
+÷ 0001 ÷ 0308 × 0903 ÷     #  ÷ [0.2] <START OF HEADING> (Control) ÷ [4.0] COMBINING DIAERESIS (Extend) × [9.1] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3]
+÷ 0001 ÷ 1100 ÷     #  ÷ [0.2] <START OF HEADING> (Control) ÷ [4.0] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3]
+÷ 0001 ÷ 0308 ÷ 1100 ÷     #  ÷ [0.2] <START OF HEADING> (Control) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3]
+÷ 0001 ÷ 1160 ÷     #  ÷ [0.2] <START OF HEADING> (Control) ÷ [4.0] HANGUL JUNGSEONG FILLER (V) ÷ [0.3]
+÷ 0001 ÷ 0308 ÷ 1160 ÷     #  ÷ [0.2] <START OF HEADING> (Control) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JUNGSEONG FILLER (V) ÷ [0.3]
+÷ 0001 ÷ 11A8 ÷     #  ÷ [0.2] <START OF HEADING> (Control) ÷ [4.0] HANGUL JONGSEONG KIYEOK (T) ÷ [0.3]
+÷ 0001 ÷ 0308 ÷ 11A8 ÷     #  ÷ [0.2] <START OF HEADING> (Control) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JONGSEONG KIYEOK (T) ÷ [0.3]
+÷ 0001 ÷ AC00 ÷     #  ÷ [0.2] <START OF HEADING> (Control) ÷ [4.0] HANGUL SYLLABLE GA (LV) ÷ [0.3]
+÷ 0001 ÷ 0308 ÷ AC00 ÷     #  ÷ [0.2] <START OF HEADING> (Control) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GA (LV) ÷ [0.3]
+÷ 0001 ÷ AC01 ÷     #  ÷ [0.2] <START OF HEADING> (Control) ÷ [4.0] HANGUL SYLLABLE GAG (LVT) ÷ [0.3]
+÷ 0001 ÷ 0308 ÷ AC01 ÷     #  ÷ [0.2] <START OF HEADING> (Control) ÷ [4.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GAG (LVT) ÷ [0.3]
+÷ 0300 ÷ 0020 ÷     #  ÷ [0.2] COMBINING GRAVE ACCENT (Extend) ÷ [999.0] SPACE (Other) ÷ [0.3]
+÷ 0300 × 0308 ÷ 0020 ÷     #  ÷ [0.2] COMBINING GRAVE ACCENT (Extend) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] SPACE (Other) ÷ [0.3]
+÷ 0300 ÷ 000D ÷     #  ÷ [0.2] COMBINING GRAVE ACCENT (Extend) ÷ [5.0] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
+÷ 0300 × 0308 ÷ 000D ÷     #  ÷ [0.2] COMBINING GRAVE ACCENT (Extend) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
+÷ 0300 ÷ 000A ÷     #  ÷ [0.2] COMBINING GRAVE ACCENT (Extend) ÷ [5.0] <LINE FEED (LF)> (LF) ÷ [0.3]
+÷ 0300 × 0308 ÷ 000A ÷     #  ÷ [0.2] COMBINING GRAVE ACCENT (Extend) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <LINE FEED (LF)> (LF) ÷ [0.3]
+÷ 0300 ÷ 0001 ÷     #  ÷ [0.2] COMBINING GRAVE ACCENT (Extend) ÷ [5.0] <START OF HEADING> (Control) ÷ [0.3]
+÷ 0300 × 0308 ÷ 0001 ÷     #  ÷ [0.2] COMBINING GRAVE ACCENT (Extend) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <START OF HEADING> (Control) ÷ [0.3]
+÷ 0300 × 0300 ÷     #  ÷ [0.2] COMBINING GRAVE ACCENT (Extend) × [9.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3]
+÷ 0300 × 0308 × 0300 ÷     #  ÷ [0.2] COMBINING GRAVE ACCENT (Extend) × [9.0] COMBINING DIAERESIS (Extend) × [9.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3]
+÷ 0300 ÷ 0E40 ÷     #  ÷ [0.2] COMBINING GRAVE ACCENT (Extend) ÷ [999.0] THAI CHARACTER SARA E (Prepend) ÷ [0.3]
+÷ 0300 × 0308 ÷ 0E40 ÷     #  ÷ [0.2] COMBINING GRAVE ACCENT (Extend) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] THAI CHARACTER SARA E (Prepend) ÷ [0.3]
+÷ 0300 × 0903 ÷     #  ÷ [0.2] COMBINING GRAVE ACCENT (Extend) × [9.1] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3]
+÷ 0300 × 0308 × 0903 ÷     #  ÷ [0.2] COMBINING GRAVE ACCENT (Extend) × [9.0] COMBINING DIAERESIS (Extend) × [9.1] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3]
+÷ 0300 ÷ 1100 ÷     #  ÷ [0.2] COMBINING GRAVE ACCENT (Extend) ÷ [999.0] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3]
+÷ 0300 × 0308 ÷ 1100 ÷     #  ÷ [0.2] COMBINING GRAVE ACCENT (Extend) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3]
+÷ 0300 ÷ 1160 ÷     #  ÷ [0.2] COMBINING GRAVE ACCENT (Extend) ÷ [999.0] HANGUL JUNGSEONG FILLER (V) ÷ [0.3]
+÷ 0300 × 0308 ÷ 1160 ÷     #  ÷ [0.2] COMBINING GRAVE ACCENT (Extend) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JUNGSEONG FILLER (V) ÷ [0.3]
+÷ 0300 ÷ 11A8 ÷     #  ÷ [0.2] COMBINING GRAVE ACCENT (Extend) ÷ [999.0] HANGUL JONGSEONG KIYEOK (T) ÷ [0.3]
+÷ 0300 × 0308 ÷ 11A8 ÷     #  ÷ [0.2] COMBINING GRAVE ACCENT (Extend) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JONGSEONG KIYEOK (T) ÷ [0.3]
+÷ 0300 ÷ AC00 ÷     #  ÷ [0.2] COMBINING GRAVE ACCENT (Extend) ÷ [999.0] HANGUL SYLLABLE GA (LV) ÷ [0.3]
+÷ 0300 × 0308 ÷ AC00 ÷     #  ÷ [0.2] COMBINING GRAVE ACCENT (Extend) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GA (LV) ÷ [0.3]
+÷ 0300 ÷ AC01 ÷     #  ÷ [0.2] COMBINING GRAVE ACCENT (Extend) ÷ [999.0] HANGUL SYLLABLE GAG (LVT) ÷ [0.3]
+÷ 0300 × 0308 ÷ AC01 ÷     #  ÷ [0.2] COMBINING GRAVE ACCENT (Extend) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GAG (LVT) ÷ [0.3]
+÷ 0E40 × 0020 ÷     #  ÷ [0.2] THAI CHARACTER SARA E (Prepend) × [9.2] SPACE (Other) ÷ [0.3]
+÷ 0E40 × 0308 ÷ 0020 ÷     #  ÷ [0.2] THAI CHARACTER SARA E (Prepend) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] SPACE (Other) ÷ [0.3]
+÷ 0E40 ÷ 000D ÷     #  ÷ [0.2] THAI CHARACTER SARA E (Prepend) ÷ [5.0] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
+÷ 0E40 × 0308 ÷ 000D ÷     #  ÷ [0.2] THAI CHARACTER SARA E (Prepend) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
+÷ 0E40 ÷ 000A ÷     #  ÷ [0.2] THAI CHARACTER SARA E (Prepend) ÷ [5.0] <LINE FEED (LF)> (LF) ÷ [0.3]
+÷ 0E40 × 0308 ÷ 000A ÷     #  ÷ [0.2] THAI CHARACTER SARA E (Prepend) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <LINE FEED (LF)> (LF) ÷ [0.3]
+÷ 0E40 ÷ 0001 ÷     #  ÷ [0.2] THAI CHARACTER SARA E (Prepend) ÷ [5.0] <START OF HEADING> (Control) ÷ [0.3]
+÷ 0E40 × 0308 ÷ 0001 ÷     #  ÷ [0.2] THAI CHARACTER SARA E (Prepend) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <START OF HEADING> (Control) ÷ [0.3]
+÷ 0E40 × 0300 ÷     #  ÷ [0.2] THAI CHARACTER SARA E (Prepend) × [9.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3]
+÷ 0E40 × 0308 × 0300 ÷     #  ÷ [0.2] THAI CHARACTER SARA E (Prepend) × [9.0] COMBINING DIAERESIS (Extend) × [9.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3]
+÷ 0E40 × 0E40 ÷     #  ÷ [0.2] THAI CHARACTER SARA E (Prepend) × [9.2] THAI CHARACTER SARA E (Prepend) ÷ [0.3]
+÷ 0E40 × 0308 ÷ 0E40 ÷     #  ÷ [0.2] THAI CHARACTER SARA E (Prepend) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] THAI CHARACTER SARA E (Prepend) ÷ [0.3]
+÷ 0E40 × 0903 ÷     #  ÷ [0.2] THAI CHARACTER SARA E (Prepend) × [9.1] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3]
+÷ 0E40 × 0308 × 0903 ÷     #  ÷ [0.2] THAI CHARACTER SARA E (Prepend) × [9.0] COMBINING DIAERESIS (Extend) × [9.1] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3]
+÷ 0E40 × 1100 ÷     #  ÷ [0.2] THAI CHARACTER SARA E (Prepend) × [9.2] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3]
+÷ 0E40 × 0308 ÷ 1100 ÷     #  ÷ [0.2] THAI CHARACTER SARA E (Prepend) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3]
+÷ 0E40 × 1160 ÷     #  ÷ [0.2] THAI CHARACTER SARA E (Prepend) × [9.2] HANGUL JUNGSEONG FILLER (V) ÷ [0.3]
+÷ 0E40 × 0308 ÷ 1160 ÷     #  ÷ [0.2] THAI CHARACTER SARA E (Prepend) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JUNGSEONG FILLER (V) ÷ [0.3]
+÷ 0E40 × 11A8 ÷     #  ÷ [0.2] THAI CHARACTER SARA E (Prepend) × [9.2] HANGUL JONGSEONG KIYEOK (T) ÷ [0.3]
+÷ 0E40 × 0308 ÷ 11A8 ÷     #  ÷ [0.2] THAI CHARACTER SARA E (Prepend) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JONGSEONG KIYEOK (T) ÷ [0.3]
+÷ 0E40 × AC00 ÷     #  ÷ [0.2] THAI CHARACTER SARA E (Prepend) × [9.2] HANGUL SYLLABLE GA (LV) ÷ [0.3]
+÷ 0E40 × 0308 ÷ AC00 ÷     #  ÷ [0.2] THAI CHARACTER SARA E (Prepend) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GA (LV) ÷ [0.3]
+÷ 0E40 × AC01 ÷     #  ÷ [0.2] THAI CHARACTER SARA E (Prepend) × [9.2] HANGUL SYLLABLE GAG (LVT) ÷ [0.3]
+÷ 0E40 × 0308 ÷ AC01 ÷     #  ÷ [0.2] THAI CHARACTER SARA E (Prepend) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GAG (LVT) ÷ [0.3]
+÷ 0903 ÷ 0020 ÷     #  ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [999.0] SPACE (Other) ÷ [0.3]
+÷ 0903 × 0308 ÷ 0020 ÷     #  ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] SPACE (Other) ÷ [0.3]
+÷ 0903 ÷ 000D ÷     #  ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [5.0] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
+÷ 0903 × 0308 ÷ 000D ÷     #  ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
+÷ 0903 ÷ 000A ÷     #  ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [5.0] <LINE FEED (LF)> (LF) ÷ [0.3]
+÷ 0903 × 0308 ÷ 000A ÷     #  ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <LINE FEED (LF)> (LF) ÷ [0.3]
+÷ 0903 ÷ 0001 ÷     #  ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [5.0] <START OF HEADING> (Control) ÷ [0.3]
+÷ 0903 × 0308 ÷ 0001 ÷     #  ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <START OF HEADING> (Control) ÷ [0.3]
+÷ 0903 × 0300 ÷     #  ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) × [9.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3]
+÷ 0903 × 0308 × 0300 ÷     #  ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) × [9.0] COMBINING DIAERESIS (Extend) × [9.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3]
+÷ 0903 ÷ 0E40 ÷     #  ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [999.0] THAI CHARACTER SARA E (Prepend) ÷ [0.3]
+÷ 0903 × 0308 ÷ 0E40 ÷     #  ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] THAI CHARACTER SARA E (Prepend) ÷ [0.3]
+÷ 0903 × 0903 ÷     #  ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) × [9.1] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3]
+÷ 0903 × 0308 × 0903 ÷     #  ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) × [9.0] COMBINING DIAERESIS (Extend) × [9.1] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3]
+÷ 0903 ÷ 1100 ÷     #  ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [999.0] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3]
+÷ 0903 × 0308 ÷ 1100 ÷     #  ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3]
+÷ 0903 ÷ 1160 ÷     #  ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [999.0] HANGUL JUNGSEONG FILLER (V) ÷ [0.3]
+÷ 0903 × 0308 ÷ 1160 ÷     #  ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JUNGSEONG FILLER (V) ÷ [0.3]
+÷ 0903 ÷ 11A8 ÷     #  ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [999.0] HANGUL JONGSEONG KIYEOK (T) ÷ [0.3]
+÷ 0903 × 0308 ÷ 11A8 ÷     #  ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JONGSEONG KIYEOK (T) ÷ [0.3]
+÷ 0903 ÷ AC00 ÷     #  ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [999.0] HANGUL SYLLABLE GA (LV) ÷ [0.3]
+÷ 0903 × 0308 ÷ AC00 ÷     #  ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GA (LV) ÷ [0.3]
+÷ 0903 ÷ AC01 ÷     #  ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [999.0] HANGUL SYLLABLE GAG (LVT) ÷ [0.3]
+÷ 0903 × 0308 ÷ AC01 ÷     #  ÷ [0.2] DEVANAGARI SIGN VISARGA (SpacingMark) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GAG (LVT) ÷ [0.3]
+÷ 1100 ÷ 0020 ÷     #  ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) ÷ [999.0] SPACE (Other) ÷ [0.3]
+÷ 1100 × 0308 ÷ 0020 ÷     #  ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] SPACE (Other) ÷ [0.3]
+÷ 1100 ÷ 000D ÷     #  ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) ÷ [5.0] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
+÷ 1100 × 0308 ÷ 000D ÷     #  ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
+÷ 1100 ÷ 000A ÷     #  ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) ÷ [5.0] <LINE FEED (LF)> (LF) ÷ [0.3]
+÷ 1100 × 0308 ÷ 000A ÷     #  ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <LINE FEED (LF)> (LF) ÷ [0.3]
+÷ 1100 ÷ 0001 ÷     #  ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) ÷ [5.0] <START OF HEADING> (Control) ÷ [0.3]
+÷ 1100 × 0308 ÷ 0001 ÷     #  ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <START OF HEADING> (Control) ÷ [0.3]
+÷ 1100 × 0300 ÷     #  ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) × [9.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3]
+÷ 1100 × 0308 × 0300 ÷     #  ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) × [9.0] COMBINING DIAERESIS (Extend) × [9.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3]
+÷ 1100 ÷ 0E40 ÷     #  ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) ÷ [999.0] THAI CHARACTER SARA E (Prepend) ÷ [0.3]
+÷ 1100 × 0308 ÷ 0E40 ÷     #  ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] THAI CHARACTER SARA E (Prepend) ÷ [0.3]
+÷ 1100 × 0903 ÷     #  ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) × [9.1] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3]
+÷ 1100 × 0308 × 0903 ÷     #  ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) × [9.0] COMBINING DIAERESIS (Extend) × [9.1] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3]
+÷ 1100 × 1100 ÷     #  ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) × [6.0] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3]
+÷ 1100 × 0308 ÷ 1100 ÷     #  ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3]
+÷ 1100 × 1160 ÷     #  ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) × [6.0] HANGUL JUNGSEONG FILLER (V) ÷ [0.3]
+÷ 1100 × 0308 ÷ 1160 ÷     #  ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JUNGSEONG FILLER (V) ÷ [0.3]
+÷ 1100 ÷ 11A8 ÷     #  ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) ÷ [999.0] HANGUL JONGSEONG KIYEOK (T) ÷ [0.3]
+÷ 1100 × 0308 ÷ 11A8 ÷     #  ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JONGSEONG KIYEOK (T) ÷ [0.3]
+÷ 1100 × AC00 ÷     #  ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) × [6.0] HANGUL SYLLABLE GA (LV) ÷ [0.3]
+÷ 1100 × 0308 ÷ AC00 ÷     #  ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GA (LV) ÷ [0.3]
+÷ 1100 × AC01 ÷     #  ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) × [6.0] HANGUL SYLLABLE GAG (LVT) ÷ [0.3]
+÷ 1100 × 0308 ÷ AC01 ÷     #  ÷ [0.2] HANGUL CHOSEONG KIYEOK (L) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GAG (LVT) ÷ [0.3]
+÷ 1160 ÷ 0020 ÷     #  ÷ [0.2] HANGUL JUNGSEONG FILLER (V) ÷ [999.0] SPACE (Other) ÷ [0.3]
+÷ 1160 × 0308 ÷ 0020 ÷     #  ÷ [0.2] HANGUL JUNGSEONG FILLER (V) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] SPACE (Other) ÷ [0.3]
+÷ 1160 ÷ 000D ÷     #  ÷ [0.2] HANGUL JUNGSEONG FILLER (V) ÷ [5.0] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
+÷ 1160 × 0308 ÷ 000D ÷     #  ÷ [0.2] HANGUL JUNGSEONG FILLER (V) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
+÷ 1160 ÷ 000A ÷     #  ÷ [0.2] HANGUL JUNGSEONG FILLER (V) ÷ [5.0] <LINE FEED (LF)> (LF) ÷ [0.3]
+÷ 1160 × 0308 ÷ 000A ÷     #  ÷ [0.2] HANGUL JUNGSEONG FILLER (V) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <LINE FEED (LF)> (LF) ÷ [0.3]
+÷ 1160 ÷ 0001 ÷     #  ÷ [0.2] HANGUL JUNGSEONG FILLER (V) ÷ [5.0] <START OF HEADING> (Control) ÷ [0.3]
+÷ 1160 × 0308 ÷ 0001 ÷     #  ÷ [0.2] HANGUL JUNGSEONG FILLER (V) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <START OF HEADING> (Control) ÷ [0.3]
+÷ 1160 × 0300 ÷     #  ÷ [0.2] HANGUL JUNGSEONG FILLER (V) × [9.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3]
+÷ 1160 × 0308 × 0300 ÷     #  ÷ [0.2] HANGUL JUNGSEONG FILLER (V) × [9.0] COMBINING DIAERESIS (Extend) × [9.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3]
+÷ 1160 ÷ 0E40 ÷     #  ÷ [0.2] HANGUL JUNGSEONG FILLER (V) ÷ [999.0] THAI CHARACTER SARA E (Prepend) ÷ [0.3]
+÷ 1160 × 0308 ÷ 0E40 ÷     #  ÷ [0.2] HANGUL JUNGSEONG FILLER (V) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] THAI CHARACTER SARA E (Prepend) ÷ [0.3]
+÷ 1160 × 0903 ÷     #  ÷ [0.2] HANGUL JUNGSEONG FILLER (V) × [9.1] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3]
+÷ 1160 × 0308 × 0903 ÷     #  ÷ [0.2] HANGUL JUNGSEONG FILLER (V) × [9.0] COMBINING DIAERESIS (Extend) × [9.1] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3]
+÷ 1160 ÷ 1100 ÷     #  ÷ [0.2] HANGUL JUNGSEONG FILLER (V) ÷ [999.0] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3]
+÷ 1160 × 0308 ÷ 1100 ÷     #  ÷ [0.2] HANGUL JUNGSEONG FILLER (V) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3]
+÷ 1160 × 1160 ÷     #  ÷ [0.2] HANGUL JUNGSEONG FILLER (V) × [7.0] HANGUL JUNGSEONG FILLER (V) ÷ [0.3]
+÷ 1160 × 0308 ÷ 1160 ÷     #  ÷ [0.2] HANGUL JUNGSEONG FILLER (V) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JUNGSEONG FILLER (V) ÷ [0.3]
+÷ 1160 × 11A8 ÷     #  ÷ [0.2] HANGUL JUNGSEONG FILLER (V) × [7.0] HANGUL JONGSEONG KIYEOK (T) ÷ [0.3]
+÷ 1160 × 0308 ÷ 11A8 ÷     #  ÷ [0.2] HANGUL JUNGSEONG FILLER (V) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JONGSEONG KIYEOK (T) ÷ [0.3]
+÷ 1160 ÷ AC00 ÷     #  ÷ [0.2] HANGUL JUNGSEONG FILLER (V) ÷ [999.0] HANGUL SYLLABLE GA (LV) ÷ [0.3]
+÷ 1160 × 0308 ÷ AC00 ÷     #  ÷ [0.2] HANGUL JUNGSEONG FILLER (V) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GA (LV) ÷ [0.3]
+÷ 1160 ÷ AC01 ÷     #  ÷ [0.2] HANGUL JUNGSEONG FILLER (V) ÷ [999.0] HANGUL SYLLABLE GAG (LVT) ÷ [0.3]
+÷ 1160 × 0308 ÷ AC01 ÷     #  ÷ [0.2] HANGUL JUNGSEONG FILLER (V) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GAG (LVT) ÷ [0.3]
+÷ 11A8 ÷ 0020 ÷     #  ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) ÷ [999.0] SPACE (Other) ÷ [0.3]
+÷ 11A8 × 0308 ÷ 0020 ÷     #  ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] SPACE (Other) ÷ [0.3]
+÷ 11A8 ÷ 000D ÷     #  ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) ÷ [5.0] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
+÷ 11A8 × 0308 ÷ 000D ÷     #  ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
+÷ 11A8 ÷ 000A ÷     #  ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) ÷ [5.0] <LINE FEED (LF)> (LF) ÷ [0.3]
+÷ 11A8 × 0308 ÷ 000A ÷     #  ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <LINE FEED (LF)> (LF) ÷ [0.3]
+÷ 11A8 ÷ 0001 ÷     #  ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) ÷ [5.0] <START OF HEADING> (Control) ÷ [0.3]
+÷ 11A8 × 0308 ÷ 0001 ÷     #  ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <START OF HEADING> (Control) ÷ [0.3]
+÷ 11A8 × 0300 ÷     #  ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) × [9.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3]
+÷ 11A8 × 0308 × 0300 ÷     #  ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) × [9.0] COMBINING DIAERESIS (Extend) × [9.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3]
+÷ 11A8 ÷ 0E40 ÷     #  ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) ÷ [999.0] THAI CHARACTER SARA E (Prepend) ÷ [0.3]
+÷ 11A8 × 0308 ÷ 0E40 ÷     #  ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] THAI CHARACTER SARA E (Prepend) ÷ [0.3]
+÷ 11A8 × 0903 ÷     #  ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) × [9.1] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3]
+÷ 11A8 × 0308 × 0903 ÷     #  ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) × [9.0] COMBINING DIAERESIS (Extend) × [9.1] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3]
+÷ 11A8 ÷ 1100 ÷     #  ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) ÷ [999.0] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3]
+÷ 11A8 × 0308 ÷ 1100 ÷     #  ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3]
+÷ 11A8 ÷ 1160 ÷     #  ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) ÷ [999.0] HANGUL JUNGSEONG FILLER (V) ÷ [0.3]
+÷ 11A8 × 0308 ÷ 1160 ÷     #  ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JUNGSEONG FILLER (V) ÷ [0.3]
+÷ 11A8 × 11A8 ÷     #  ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) × [8.0] HANGUL JONGSEONG KIYEOK (T) ÷ [0.3]
+÷ 11A8 × 0308 ÷ 11A8 ÷     #  ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JONGSEONG KIYEOK (T) ÷ [0.3]
+÷ 11A8 ÷ AC00 ÷     #  ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) ÷ [999.0] HANGUL SYLLABLE GA (LV) ÷ [0.3]
+÷ 11A8 × 0308 ÷ AC00 ÷     #  ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GA (LV) ÷ [0.3]
+÷ 11A8 ÷ AC01 ÷     #  ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) ÷ [999.0] HANGUL SYLLABLE GAG (LVT) ÷ [0.3]
+÷ 11A8 × 0308 ÷ AC01 ÷     #  ÷ [0.2] HANGUL JONGSEONG KIYEOK (T) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GAG (LVT) ÷ [0.3]
+÷ AC00 ÷ 0020 ÷     #  ÷ [0.2] HANGUL SYLLABLE GA (LV) ÷ [999.0] SPACE (Other) ÷ [0.3]
+÷ AC00 × 0308 ÷ 0020 ÷     #  ÷ [0.2] HANGUL SYLLABLE GA (LV) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] SPACE (Other) ÷ [0.3]
+÷ AC00 ÷ 000D ÷     #  ÷ [0.2] HANGUL SYLLABLE GA (LV) ÷ [5.0] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
+÷ AC00 × 0308 ÷ 000D ÷     #  ÷ [0.2] HANGUL SYLLABLE GA (LV) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
+÷ AC00 ÷ 000A ÷     #  ÷ [0.2] HANGUL SYLLABLE GA (LV) ÷ [5.0] <LINE FEED (LF)> (LF) ÷ [0.3]
+÷ AC00 × 0308 ÷ 000A ÷     #  ÷ [0.2] HANGUL SYLLABLE GA (LV) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <LINE FEED (LF)> (LF) ÷ [0.3]
+÷ AC00 ÷ 0001 ÷     #  ÷ [0.2] HANGUL SYLLABLE GA (LV) ÷ [5.0] <START OF HEADING> (Control) ÷ [0.3]
+÷ AC00 × 0308 ÷ 0001 ÷     #  ÷ [0.2] HANGUL SYLLABLE GA (LV) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <START OF HEADING> (Control) ÷ [0.3]
+÷ AC00 × 0300 ÷     #  ÷ [0.2] HANGUL SYLLABLE GA (LV) × [9.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3]
+÷ AC00 × 0308 × 0300 ÷     #  ÷ [0.2] HANGUL SYLLABLE GA (LV) × [9.0] COMBINING DIAERESIS (Extend) × [9.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3]
+÷ AC00 ÷ 0E40 ÷     #  ÷ [0.2] HANGUL SYLLABLE GA (LV) ÷ [999.0] THAI CHARACTER SARA E (Prepend) ÷ [0.3]
+÷ AC00 × 0308 ÷ 0E40 ÷     #  ÷ [0.2] HANGUL SYLLABLE GA (LV) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] THAI CHARACTER SARA E (Prepend) ÷ [0.3]
+÷ AC00 × 0903 ÷     #  ÷ [0.2] HANGUL SYLLABLE GA (LV) × [9.1] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3]
+÷ AC00 × 0308 × 0903 ÷     #  ÷ [0.2] HANGUL SYLLABLE GA (LV) × [9.0] COMBINING DIAERESIS (Extend) × [9.1] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3]
+÷ AC00 ÷ 1100 ÷     #  ÷ [0.2] HANGUL SYLLABLE GA (LV) ÷ [999.0] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3]
+÷ AC00 × 0308 ÷ 1100 ÷     #  ÷ [0.2] HANGUL SYLLABLE GA (LV) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3]
+÷ AC00 × 1160 ÷     #  ÷ [0.2] HANGUL SYLLABLE GA (LV) × [7.0] HANGUL JUNGSEONG FILLER (V) ÷ [0.3]
+÷ AC00 × 0308 ÷ 1160 ÷     #  ÷ [0.2] HANGUL SYLLABLE GA (LV) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JUNGSEONG FILLER (V) ÷ [0.3]
+÷ AC00 × 11A8 ÷     #  ÷ [0.2] HANGUL SYLLABLE GA (LV) × [7.0] HANGUL JONGSEONG KIYEOK (T) ÷ [0.3]
+÷ AC00 × 0308 ÷ 11A8 ÷     #  ÷ [0.2] HANGUL SYLLABLE GA (LV) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JONGSEONG KIYEOK (T) ÷ [0.3]
+÷ AC00 ÷ AC00 ÷     #  ÷ [0.2] HANGUL SYLLABLE GA (LV) ÷ [999.0] HANGUL SYLLABLE GA (LV) ÷ [0.3]
+÷ AC00 × 0308 ÷ AC00 ÷     #  ÷ [0.2] HANGUL SYLLABLE GA (LV) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GA (LV) ÷ [0.3]
+÷ AC00 ÷ AC01 ÷     #  ÷ [0.2] HANGUL SYLLABLE GA (LV) ÷ [999.0] HANGUL SYLLABLE GAG (LVT) ÷ [0.3]
+÷ AC00 × 0308 ÷ AC01 ÷     #  ÷ [0.2] HANGUL SYLLABLE GA (LV) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GAG (LVT) ÷ [0.3]
+÷ AC01 ÷ 0020 ÷     #  ÷ [0.2] HANGUL SYLLABLE GAG (LVT) ÷ [999.0] SPACE (Other) ÷ [0.3]
+÷ AC01 × 0308 ÷ 0020 ÷     #  ÷ [0.2] HANGUL SYLLABLE GAG (LVT) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] SPACE (Other) ÷ [0.3]
+÷ AC01 ÷ 000D ÷     #  ÷ [0.2] HANGUL SYLLABLE GAG (LVT) ÷ [5.0] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
+÷ AC01 × 0308 ÷ 000D ÷     #  ÷ [0.2] HANGUL SYLLABLE GAG (LVT) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
+÷ AC01 ÷ 000A ÷     #  ÷ [0.2] HANGUL SYLLABLE GAG (LVT) ÷ [5.0] <LINE FEED (LF)> (LF) ÷ [0.3]
+÷ AC01 × 0308 ÷ 000A ÷     #  ÷ [0.2] HANGUL SYLLABLE GAG (LVT) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <LINE FEED (LF)> (LF) ÷ [0.3]
+÷ AC01 ÷ 0001 ÷     #  ÷ [0.2] HANGUL SYLLABLE GAG (LVT) ÷ [5.0] <START OF HEADING> (Control) ÷ [0.3]
+÷ AC01 × 0308 ÷ 0001 ÷     #  ÷ [0.2] HANGUL SYLLABLE GAG (LVT) × [9.0] COMBINING DIAERESIS (Extend) ÷ [5.0] <START OF HEADING> (Control) ÷ [0.3]
+÷ AC01 × 0300 ÷     #  ÷ [0.2] HANGUL SYLLABLE GAG (LVT) × [9.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3]
+÷ AC01 × 0308 × 0300 ÷     #  ÷ [0.2] HANGUL SYLLABLE GAG (LVT) × [9.0] COMBINING DIAERESIS (Extend) × [9.0] COMBINING GRAVE ACCENT (Extend) ÷ [0.3]
+÷ AC01 ÷ 0E40 ÷     #  ÷ [0.2] HANGUL SYLLABLE GAG (LVT) ÷ [999.0] THAI CHARACTER SARA E (Prepend) ÷ [0.3]
+÷ AC01 × 0308 ÷ 0E40 ÷     #  ÷ [0.2] HANGUL SYLLABLE GAG (LVT) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] THAI CHARACTER SARA E (Prepend) ÷ [0.3]
+÷ AC01 × 0903 ÷     #  ÷ [0.2] HANGUL SYLLABLE GAG (LVT) × [9.1] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3]
+÷ AC01 × 0308 × 0903 ÷     #  ÷ [0.2] HANGUL SYLLABLE GAG (LVT) × [9.0] COMBINING DIAERESIS (Extend) × [9.1] DEVANAGARI SIGN VISARGA (SpacingMark) ÷ [0.3]
+÷ AC01 ÷ 1100 ÷     #  ÷ [0.2] HANGUL SYLLABLE GAG (LVT) ÷ [999.0] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3]
+÷ AC01 × 0308 ÷ 1100 ÷     #  ÷ [0.2] HANGUL SYLLABLE GAG (LVT) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL CHOSEONG KIYEOK (L) ÷ [0.3]
+÷ AC01 ÷ 1160 ÷     #  ÷ [0.2] HANGUL SYLLABLE GAG (LVT) ÷ [999.0] HANGUL JUNGSEONG FILLER (V) ÷ [0.3]
+÷ AC01 × 0308 ÷ 1160 ÷     #  ÷ [0.2] HANGUL SYLLABLE GAG (LVT) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JUNGSEONG FILLER (V) ÷ [0.3]
+÷ AC01 × 11A8 ÷     #  ÷ [0.2] HANGUL SYLLABLE GAG (LVT) × [8.0] HANGUL JONGSEONG KIYEOK (T) ÷ [0.3]
+÷ AC01 × 0308 ÷ 11A8 ÷     #  ÷ [0.2] HANGUL SYLLABLE GAG (LVT) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL JONGSEONG KIYEOK (T) ÷ [0.3]
+÷ AC01 ÷ AC00 ÷     #  ÷ [0.2] HANGUL SYLLABLE GAG (LVT) ÷ [999.0] HANGUL SYLLABLE GA (LV) ÷ [0.3]
+÷ AC01 × 0308 ÷ AC00 ÷     #  ÷ [0.2] HANGUL SYLLABLE GAG (LVT) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GA (LV) ÷ [0.3]
+÷ AC01 ÷ AC01 ÷     #  ÷ [0.2] HANGUL SYLLABLE GAG (LVT) ÷ [999.0] HANGUL SYLLABLE GAG (LVT) ÷ [0.3]
+÷ AC01 × 0308 ÷ AC01 ÷     #  ÷ [0.2] HANGUL SYLLABLE GAG (LVT) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HANGUL SYLLABLE GAG (LVT) ÷ [0.3]
+# Lines: 288
diff --git a/lib/unicore/mktables b/lib/unicore/mktables

index f39466a..c61a3f4 100644 (file)
--- a/lib/unicore/mktables
+++ b/lib/unicore/mktables
@@ -998,6 +998,7 @@ my $MULTIPLE = 4;          # Don't replace, but add a duplicate record if
 my $NORMAL = "";
 my $SUPPRESSED = 'z';   # The character should never actually be seen, since
                         # it is suppressed
+my $PLACEHOLDER = 'P';  # Implies no pod entry generated
 my $DEPRECATED = 'D';
 my $a_bold_deprecated = "a 'B<$DEPRECATED>'";
 my $A_bold_deprecated = "A 'B<$DEPRECATED>'";
@@ -1019,7 +1020,7 @@ my %status_past_participles = (
     $SUPPRESSED => 'should never be generated',
     $STABILIZED => 'stabilized',
     $OBSOLETE => 'obsolete',
-    $DEPRECATED => 'deprecated'
+    $DEPRECATED => 'deprecated',
 );
 
 # The format of the values of the map tables:
@@ -1074,6 +1075,7 @@ my %Jamo_L;     # Leading consonants
 my %Jamo_V;     # Vowels
 my %Jamo_T;     # Trailing consonants
 
+my @backslash_X_tests;     # List of tests read in for testing \X
 my @unhandled_properties;  # Will contain a list of properties found in
                            # the input that we didn't process.
 my @match_properties;      # Properties that have match tables, to be
@@ -1604,6 +1606,15 @@ sub trace { return main::trace(@_); }
     # processed when you set the $debug_skip global.
     main::set_access('non_skip', \%non_skip, 'c');
 
+    my %skip;
+    # This is used to skip processing of this input file semi-permanently.
+    # It is used for files that we aren't planning to process anytime soon,
+    # but want to allow to be in the directory and not raise a message that we
+    # are not handling.  Mostly for test files.  This is in contrast to the
+    # non_skip element, which is supposed to be used very temporarily for
+    # debugging.  Sets 'optional' to 1
+    main::set_access('skip', \%skip, 'c');
+
     my %each_line_handler;
     # list of subroutines to look at and filter each non-comment line in the
     # file.  defaults to none.  The subroutines are called in order, each is
@@ -1667,6 +1678,7 @@ sub trace { return main::trace(@_); }
         # Set defaults
         $handler{$addr} = \&main::process_generic_property_file;
         $non_skip{$addr} = 0;
+        $skip{$addr} = 0;
         $has_missings_defaults{$addr} = $NO_DEFAULTS;
         $handle{$addr} = undef;
         $added_lines{$addr} = [ ];
@@ -1723,6 +1735,8 @@ sub trace { return main::trace(@_); }
             print "Warning: " . __PACKAGE__ . " constructor for $file{$addr} has useless 'non_skip' in it\n";
         }
 
+        $optional{$addr} = 1 if $skip{$addr};
+
         return $self;
     }
 
@@ -1771,7 +1785,7 @@ sub trace { return main::trace(@_); }
         }
 
         # File could be optional
-        if ($optional{$addr}){
+        if ($optional{$addr}) {
             return unless -e $file;
             my $result = eval $optional{$addr};
             if (! defined $result) {
@@ -1804,7 +1818,8 @@ sub trace { return main::trace(@_); }
         }
         else {
 
-            # Here, the file exists
+            # Here, the file exists.  Some platforms may change the case of
+            # its name
             if ($seen_non_extracted_non_age) {
                 if ($file =~ /$EXTRACTED/i) {
                     Carp::my_carp_bug(join_lines(<<END
@@ -1837,6 +1852,13 @@ END
                     ! $expecting                    
                     && ! defined $handle{$addr};
 
+            # Having deleted from expected files, we can quit if there is
+            # nothing to do.  Don't print progress unless verbosity is wanted
+            if ($skip{$addr}) {
+                print "Skipping $file.\n" if $verbosity >= $VERBOSE;
+                return;
+            }
+
             # Open the file, converting the slashes used in this program
             # into the proper form for the OS
             my $file_handle;
@@ -3846,14 +3868,12 @@ sub trace { return main::trace(@_); }
 
         # For non-ASCII, we shun the characters that don't have Perl encoding-
         # independent symbols for them.  'A' is such a symbol, so is "\n".
-        # Note, this program hopefully will work on 5.8 Perls, and \v is not
-        # such a symbol in them.
         return $try_hard if $non_ASCII
                             && $code <= 0xFF
                             && ($code >= 0x7F
                                 || ($code >= 0x0E && $code <= 0x1F)
                                 || ($code >= 0x01 && $code <= 0x06)
-                                || $code == 0x0B);  # \v introduced after 5.8
+                                || $code == 0x0B);
 
         # shun null.  I'm (khw) not sure why this was done, but NULL would be
         # the character very frequently used.
@@ -4075,7 +4095,6 @@ sub trace { return main::trace(@_); }
         my $complete_name = $complete_name{$addr}
                           = delete $args{'Complete_Name'};
         $internal_only{$addr} = delete $args{'Internal_Only_Warning'} || 0;
-        $perl_extension{$addr} = delete $args{'Perl_Extension'} || 0;
         $property{$addr} = delete $args{'_Property'};
         $range_list{$addr} = delete $args{'_Range_List'};
         $status{$addr} = delete $args{'Status'} || $NORMAL;
@@ -4087,6 +4106,7 @@ sub trace { return main::trace(@_); }
         my $loose_match = delete $args{'Fuzzy'};
         my $note = delete $args{'Note'};
         my $make_pod_entry = delete $args{'Pod_Entry'};
+        my $perl_extension = delete $args{'Perl_Extension'};
 
         # Shouldn't have any left over
         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
@@ -4105,11 +4125,20 @@ sub trace { return main::trace(@_); }
         push @{$description{$addr}}, $description if $description;
         push @{$note{$addr}}, $note if $note;
 
-        # If hasn't set its status already, see if it is on one of the lists
-        # of properties or tables that have particular statuses; if not, is
-        # normal.  The lists are prioritized so the most serious ones are
-        # checked first
-        if (! $status{$addr}) {
+        if ($status{$addr} eq $PLACEHOLDER) {
+
+            # A placeholder table doesn't get documented, is a perl extension,
+            # and quite likely will be empty
+            $make_pod_entry = 0 if ! defined $make_pod_entry;
+            $perl_extension = 1 if ! defined $perl_extension;
+            push @tables_that_may_be_empty, $complete_name{$addr};
+        }
+        elsif (! $status{$addr}) {
+
+            # If hasn't set its status already, see if it is on one of the
+            # lists of properties or tables that have particular statuses; if
+            # not, is normal.  The lists are prioritized so the most serious
+            # ones are checked first
             if (exists $why_suppressed{$complete_name}) {
                 $status{$addr} = $SUPPRESSED;
             }
@@ -4145,6 +4174,8 @@ sub trace { return main::trace(@_); }
             }
         }
 
+        $perl_extension{$addr} = $perl_extension || 0;
+
         # By convention what typically gets printed only or first is what's
         # first in the list, so put the full name there for good output
         # clarity.  Other routines rely on the full name being first on the
@@ -6204,7 +6235,17 @@ END
                     my $flag = $property->status
                                 || $table->status
                                 || $table_alias_object->status;
-                    $flags{$flag} = $status_past_participles{$flag} if $flag;
+                    if ($flag) {
+                        if ($flag ne $PLACEHOLDER) {
+                            $flags{$flag} = $status_past_participles{$flag};
+                        } else {
+                            $flags{$flag} = <<END;
+a placeholder because it is not in Version $string_version of Unicode, but is
+needed by the Perl core to work gracefully.  Because it is not in this version
+of Unicode, it will not be listed in $pod_file.pod
+END
+                        }
+                    }
 
                     $loose_count++;
 
@@ -6221,6 +6262,9 @@ END
                 push @note, $table->note;
                 push @conflicting, $table->conflicting;
 
+                # And this for output after all the tables.
+                push @global_comments, $table->comment;
+
                 # Compute an alternate compound name using the final property
                 # synonym and the first table synonym with a colon instead of
                 # the equal sign used elsewhere.
@@ -6306,8 +6350,10 @@ END
         if (%flags) {
             foreach my $flag (sort keys %flags) {
                 $comment .= <<END;
-'$flag' below means that this form is $flags{$flag}.  Consult $pod_file.pod
+'$flag' below means that this form is $flags{$flag}.
 END
+                next if $flag eq $PLACEHOLDER;
+                $comment .= "Consult $pod_file.pod\n";
             }
             $comment .= "\n";
         }
@@ -6317,7 +6363,7 @@ This file returns the $code_points in Unicode Version $string_version that
 $match$synonyms:
 
 $matches_comment
-$pod_file.pod should be consulted for the rules on using $any_of_these,
+$pod_file.pod should be consulted for the syntax rules for $any_of_these,
 including if adding or subtracting white space, underscore, and hyphen
 characters matters or doesn't matter, and other permissible syntactic
 variants.  Upper/lower case distinctions never matter.
@@ -6346,7 +6392,9 @@ END
 
         # And append any comment(s) from the actual tables.  They are all
         # gathered here, so may not read all that well.
-        $comment .= "\n" . join "\n\n", @global_comments if @global_comments;
+        if (@global_comments) {
+            $comment .= "\n" . join("\n\n", @global_comments) . "\n";
+        }
 
         if ($count) {   # The format differs if no code points, and needs no
                         # explanation in that case
@@ -9503,6 +9551,18 @@ END
     }
 } # End closure for UnicodeData
 
+sub process_GCB_test {
+    # Handler for GCBTest.txt: each line is saved for output as a \X test.
+    my $file = shift;
+    Carp::carp_extra_args(\@_) if main::DEBUG && @_;
+
+    while ($file->next_line) {
+        push @backslash_X_tests, $_;
+    }
+        
+    return;
+}
+
 sub process_NamedSequences {
     # NamedSequences.txt entries are just added to an array.  Because these
     # don't look like the other tables, they have their own handler.
@@ -10795,21 +10855,78 @@ sub compile_perl() {
     }
 
     # These are used in Unicode's definition of \X
+    my $begin = $perl->add_match_table('_X_Begin', Perl_Extension => 1);
+    my $extend = $perl->add_match_table('_X_Extend', Perl_Extension => 1);
+
     my $gcb = property_ref('Grapheme_Cluster_Break');
-    #my $extend = $perl->add_match_table('_X_Extend');
-    my $extend = $perl->add_match_table('_GCB_Extend');
-    # XXX until decide what todo my $begin = $perl->add_match_table('_X_Begin');
-    if (defined $gcb) {
-        $extend += $gcb->table('Extend') + $gcb->table('SpacingMark')
-        #$begin += ~ ($gcb->table('Control')
-        #             + $gcb->table('CR')
-        #             + $gcb->table('LF'));
+
+    # The 'extended' grapheme cluster came in 5.1.  The non-extended 
+    # definition differs too much from the traditional Perl one to use.
+    if (defined $gcb && defined $gcb->table('SpacingMark')) {
+
+        # Note that this assumes HST is defined; it came in an earlier release
+        # than GCB.  In the line below, two negatives mean: yes hangul
+        $begin += ~ property_ref('Hangul_Syllable_Type')
+                                                    ->table('Not_Applicable')
+               + ~ ($gcb->table('Control')
+                    + $gcb->table('CR')
+                    + $gcb->table('LF'));
+        $begin->add_comment('For use in \X; matches: Hangul_Syllable | ! Control');
+
+        $extend += $gcb->table('Extend') + $gcb->table('SpacingMark');
+        $extend->add_comment('For use in \X; matches: Extend | SpacingMark');
     }
     else {    # Old definition, used on early releases.
         $extend += $gc->table('Mark')
-                    + 0x200C    # ZWNJ
-                    + 0x200D;    # ZWJ
-        #$begin += ~ $extend;
+                + 0x200C    # ZWNJ
+                + 0x200D;   # ZWJ
+        $begin += ~ $extend;
+
+        # Here we may have a release that has the regular grapheme cluster
+        # defined, or a release that doesn't have anything defined.
+        # We set things up so the Perl core degrades gracefully, possibly with
+        # placeholders that match nothing.
+
+        if (! defined $gcb) {
+            $gcb = Property->new('GCB', Status => $PLACEHOLDER);
+        }
+        my $hst = property_ref('HST');
+        if (!defined $hst) {
+            $hst = Property->new('HST', Status => $PLACEHOLDER);
+            $hst->add_match_table('Not_Applicable',
+                                Initialize => $Any,
+                                Matches_All => 1);
+        }
+
+        # On some releases, here we may not have the needed tables for the
+        # perl core, in some releases we may.
+        foreach my $name (qw{ L LV LVT T V prepend }) {
+            my $table = $gcb->table($name);
+            if (! defined $table) {
+                $table = $gcb->add_match_table($name);
+                push @tables_that_may_be_empty, $table->complete_name;
+            }
+
+            # The HST property predates the GCB one, and has identical tables
+            # for some of them, so use it if we can.
+            if ($table->is_empty
+                && defined $hst
+                && defined $hst->table($name))
+            {
+                $table += $hst->table($name);
+            }
+        }
+    }
+
+    # More GCB.  If we found some hangul syllables, populate a combined
+    # table.
+    my $lv_lvt_v = $perl->add_match_table('_X_LV_LVT_V');
+    my $LV = $gcb->table('LV');
+    if ($LV->is_empty) {
+        push @tables_that_may_be_empty, $lv_lvt_v->complete_name;
+    } else {
+        $lv_lvt_v += $LV + $gcb->table('LVT') + $gcb->table('V');
+        $lv_lvt_v->add_comment('For use in \X; matches: HST=LV | HST=LVT | HST=V');
     }
 
     # Create a new property specially located that is a combination of the
@@ -13231,6 +13348,11 @@ sub make_property_test_script() {
             }
         }
     }
+
+    foreach my $test (@backslash_X_tests) {
+        print $OUT "Test_X('$test');\n";
+    }
+
     print $OUT "Finished();\n";
     close $OUT;
     return;
@@ -13380,6 +13502,9 @@ my @input_file_objects = (
     Input_file->new('BidiMirroring.txt', v3.0.1,
                     Property => 'Bidi_Mirroring_Glyph',
                     ),
+    Input_file->new("NormalizationTest.txt", v3.0.1,
+                    Skip => 1,
+                    ),
     Input_file->new('CaseFolding.txt', v3.0.1,
                     Pre_Handler => \&setup_case_folding,
                     Each_Line_Handler =>
@@ -13417,6 +13542,18 @@ my @input_file_objects = (
                     Property => 'Grapheme_Cluster_Break',
                     Has_Missings_Defaults => $NOT_IGNORED,
                     ),
+    Input_file->new("$AUXILIARY/GCBTest.txt", v4.1.0,
+                    Handler => \&process_GCB_test,
+                    ),
+    Input_file->new("$AUXILIARY/LBTest.txt", v4.1.0,
+                    Skip => 1,
+                    ),
+    Input_file->new("$AUXILIARY/SBTest.txt", v4.1.0,
+                    Skip => 1,
+                    ),
+    Input_file->new("$AUXILIARY/WBTest.txt", v4.1.0,
+                    Skip => 1,
+                    ),
     Input_file->new("$AUXILIARY/SentenceBreakProperty.txt", v4.1.0,
                     Property => 'Sentence_Break',
                     Has_Missings_Defaults => $NOT_IGNORED,
@@ -13427,6 +13564,9 @@ my @input_file_objects = (
     Input_file->new('NameAliases.txt', v5.0.0,
                     Property => 'Name_Alias',
                     ),
+    Input_file->new("BidiTest.txt", v5.2.0,
+                    Skip => 1,
+                    ),
     Input_file->new('UnihanIndicesDictionary.txt', v5.2.0,
                     Optional => 1,
                     Each_Line_Handler => \&filter_unihan_line,
@@ -13474,18 +13614,16 @@ END
 
 # Put into %potential_files a list of all the files in the directory structure
 # that could be inputs to this program, excluding those that we should ignore.
-# Also don't consider test files.  Use absolute file names because it makes it
-# easier across machine types.
+# Use absolute file names because it makes it easier across machine types.
 my @ignored_files_full_names = map { File::Spec->rel2abs(
                                      internal_file_to_platform($_))
                                 } keys %ignored_files;
 File::Find::find({
     wanted=>sub {
-        return unless /\.txt$/i;
-        return if /Test\.txt$/i;
+        return unless /\.txt$/i;  # Some platforms change the name's case
         my $full = lc(File::Spec->rel2abs($_));
         $potential_files{$full} = 1
-                        if ! grep { $full eq lc($_) } @ignored_files_full_names;
+                    if ! grep { $full eq lc($_) } @ignored_files_full_names;
         return;
     }
 }, File::Spec->curdir());
@@ -13584,7 +13722,7 @@ if ($glob_list) {
                     && $input_file_objects[$i]->file !~ /$EXTRACTED_DIR/i)
                 {
                     splice @input_file_objects, $i, 0,
-                                                    Input_file->new($file, v0);
+                                                Input_file->new($file, v0);
                     last;
                 }
             }
@@ -13758,28 +13896,53 @@ __DATA__
 use strict;
 use warnings;
 
-# Test the \p{} regular expression constructs.  This file is constructed by
-# mktables from the tables it generates, so if mktables is buggy, this won't
-# necessarily catch those bugs.  Tests are generated for all feasible
-# properties; a few aren't currently feasible; see is_code_point_usable()
-# in mktables for details.
+# Test qr/\X/ and the \p{} regular expression constructs.  This file is
+# constructed by mktables from the tables it generates, so if mktables is
+# buggy, this won't necessarily catch those bugs.  Tests are generated for all
+# feasible properties; a few aren't currently feasible; see
+# is_code_point_usable() in mktables for details.
 
 # Standard test packages are not used because this manipulates SIG_WARN.  It
 # exits 0 if every non-skipped test succeeded; -1 if any failed.
 
 my $Tests = 0;
 my $Fails = 0;
-my $Skips = 0;
 
 my $non_ASCII = (ord('A') != 65);
 
-# The first 127 ASCII characters in ordinal order, with the ones that don't
-# have Perl names (as of 5.8) replaced by dots.  The 127th is used as the
-# string delimiter
-my $ascii_to_ebcdic = "\0......\a\b\t\n.\f\r.................. !\"#\$\%&'()*+,-./0123456789:;<=>?\@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~";
-#for my $i (0..126) {
-#    print $i, ": ", substr($ascii_to_ebcdic, $i, 1), "\n";
-#}
+# The 256 8-bit characters in ASCII ordinal order, with the ones that don't
+# have Perl names replaced by -1
+my @ascii_ordered_chars = (
+    "\0",
+    (-1) x 6,
+    "\a", "\b", "\t", "\n",
+    -1,   # VT (vertical tab) has no Perl name
+    "\f", "\r",
+    (-1) x 18,
+    " ", "!", "\"", "#", '$', "%", "&", "'",
+    "(", ")", "*", "+", ",", "-", ".", "/",
+    "0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
+    ":", ";", "<", "=", ">", "?", "@",
+    "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M",
+    "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z",
+    "[", "\\", "]", "^", "_", "`",
+    "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
+    "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z",
+    "{", "|", "}", "~",
+    (-1) x 129
+);
+
+sub ASCII_ord_to_native ($) {
+    # Converts the input ordinal number to the native one, if that can be
+    # done easily.  Returns -1 otherwise.
+
+    my $ord = shift;
+
+    return $ord if $ord > 255 || ! $non_ASCII;
+    my $result = $ascii_ordered_chars[$ord];
+    return $result if $result eq '-1';
+    return ord($result);
+}
 
 sub Expect($$$$) {
     my $expected = shift;
@@ -13789,38 +13952,24 @@ sub Expect($$$$) {
                                 # or empty if none
     my $line   = (caller)[2];
 
+    # On a non-ASCII platform, convert the code points expressible as
+    # characters to their native equivalents, and skip the others.
+    $ord = ASCII_ord_to_native($ord);
+    if ($ord < 0) {
+        $Tests++;
+        print "ok $Tests - "
+              . sprintf("\"\\x{%04X}\"", $ord)
+              . " =~ $regex # Skipped: non-ASCII\n";
+        return;
+    }
+
     # Convert the code point to hex form
     my $string = sprintf "\"\\x{%04X}\"", $ord;
 
-    # Convert the non-ASCII code points expressible as characters in Perl 5.8
-    # to their ASCII equivalents, and skip the others.
-    if ($non_ASCII && $ord < 255) {
-
-        # Dots are used as place holders in the conversion string for the
-        # non-convertible ones, so check for it first.
-        if ($ord == 0x2E) {
-            $ord = ord('.');
-        }
-        elsif ($ord < 0x7F
-                  # Any dots returned are non-convertible.
-                 && ((my $char = substr($ascii_to_ebcdic, $ord, 1)) ne '.'))
-        {
-            #print STDERR "$ord, $char, \n";
-            $ord = ord($char);
-        }
-        else {
-            $Tests++;
-            $Skips++;
-            print "ok $Tests - $string =~ $regex # Skipped: non-ASCII\n";
-            return;
-        }
-    }
-
-    # The first time through, use all warnings.
     my @tests = "";
 
-    # If the input should generate a warning, add another time through with
-    # them turned off
+    # The first time through, use all warnings.  If the input should generate
+    # a warning, add another time through with them turned off
     push @tests, "no warnings '$warning_type';" if $warning_type;
 
     foreach my $no_warnings (@tests) {
@@ -13880,9 +14029,142 @@ sub Error($) {
     return;
 }
 
+# GCBTest.txt character that separates grapheme clusters
+my $breakable_utf8 = my $breakable = chr(0xF7);
+utf8::upgrade($breakable_utf8);
+
+# GCBTest.txt character that indicates that the adjoining code points are part
+# of the same grapheme cluster
+my $nobreak_utf8 = my $nobreak = chr(0xD7);
+utf8::upgrade($nobreak_utf8);
+
+sub Test_X($) {
+    # Test qr/\X/ matches.  The input is a line from auxiliary/GCBTest.txt.
+    # Each such line is a sequence of code points given by their hex numbers,
+    # separated by the two characters defined just before this subroutine that
+    # indicate that either there can or cannot be a break between the adjacent
+    # code points.  If there isn't a break, that means the sequence forms an
+    # extended grapheme cluster, which means that \X should match the whole
+    # thing.  If there is a break, \X should stop there.  This is all
+    # converted by this routine into a match:
+    #   $string =~ /(\X)/,
+    # Each \X should match the next cluster; and that is what is checked.
+
+    my $template = shift;
+
+    my $line   = (caller)[2];
+
+    # The line contains characters above the ASCII range, but in Latin1.  It
+    # may or may not be in utf8, and if it is, it may or may not know it.  So,
+    # convert these characters to 8 bits.  If it knows it is in utf8, simply
+    # downgrade.
+    if (utf8::is_utf8($template)) {
+        utf8::downgrade($template);
+    } else {
+
+        # Otherwise, if it is in utf8, but doesn't know it, the next lines
+        # convert the two problematic characters to their 8-bit equivalents.
+        # If it isn't in utf8, they don't harm anything.
+        use bytes;
+        $template =~ s/$nobreak_utf8/$nobreak/g;
+        $template =~ s/$breakable_utf8/$breakable/g;
+    }
+
+    # Get rid of the leading and trailing breakables
+    $template =~ s/^ \s* $breakable \s* //x;
+    $template =~ s/ \s* $breakable \s* $ //x;
+
+    # And no-breaks become just a space.
+    $template =~ s/ \s* $nobreak \s* / /xg;
+
+    # Split the input into segments that are breakable between them.
+    my @segments = split /\s*$breakable\s*/, $template;
+
+    my $string = "";
+    my $display_string = "";
+    my @should_match;
+    my @should_display;
+
+    # Convert the code point sequence in each segment into a Perl string of
+    # characters
+    foreach my $segment (@segments) {
+        my @code_points = split /\s+/, $segment;
+        my $this_string = "";
+        my $this_display = "";
+        foreach my $code_point (@code_points) {
+            my $ord = ASCII_ord_to_native(hex $code_point);
+            if ($ord < 0) {
+                $Tests++;
+                print "ok $Tests - String containing $code_point =~ /(\\X)/g # Skipped: non-ASCII\n";
+                return;
+            }
+            $this_string .= chr $ord;
+            $this_display .= "\\x{$code_point}";
+        }
+
+        # The next cluster should match the string in this segment.
+        push @should_match, $this_string;
+        push @should_display, $this_display;
+        $string .= $this_string;
+        $display_string .= $this_display;
+    }
+
+    # If a string can be represented in both non-utf8 and utf8, test both cases
+    UPGRADE:
+    for my $to_upgrade (0 .. 1) {
+        
+        if ($to_upgrade) {
+
+            # If already in utf8, would just be a repeat
+            next UPGRADE if utf8::is_utf8($string);
+
+            utf8::upgrade($string);
+        }
+
+        # Finally, do the \X match.
+        my @matches = $string =~ /(\X)/g;
+
+        # Look through each matched cluster to verify that it matches what we
+        # expect.
+        my $min = (@matches < @should_match) ? @matches : @should_match;
+        for my $i (0 .. $min - 1) {
+            $Tests++;
+            if ($matches[$i] eq $should_match[$i]) {
+                print "ok $Tests - ";
+                if ($i == 0) {
+                    print "In \"$display_string\" =~ /(\\X)/g, \\X #1";
+                } else {
+                    print "And \\X #", $i + 1,
+                }
+                print " correctly matched $should_display[$i]; line $line\n";
+            } else {
+                $matches[$i] = join("", map { sprintf "\\x{%04X}", $_ }
+                                                    unpack("U*", $matches[$i]));
+                print "not ok $Tests - In \"$display_string\" =~ /(\\X)/g, \\X #",
+                    $i + 1,
+                    " should have matched $should_display[$i]",
+                    " but instead matched $matches[$i]",
+                    ".  Abandoning rest of line $line\n";
+                next UPGRADE;
+            }
+        }
+
+        # And the number of matches should equal the number of expected matches.
+        $Tests++;
+        if (@matches == @should_match) {
+            print "ok $Tests - Nothing was left over; line $line\n";
+        } else {
+            print "not ok $Tests - There were ", scalar @should_match, " \\X matches expected, but got ", scalar @matches, " instead; line $line\n";
+        }
+    }
+
+    return;
+}
+
 sub Finished() {
     print "1..$Tests\n";
     exit($Fails ? -1 : 0);
 }
 
 Error('\p{Script=InGreek}');    # Bug #69018
+Test_X("1100 $nobreak 1161");  # Bug #70940
diff --git a/perlapi.h b/perlapi.h

index 5c2df74..54ddab0 100644 (file)
--- a/perlapi.h
+++ b/perlapi.h
@@ -684,6 +684,26 @@ END_EXTERN_C
 #define PL_unlockhook          (*Perl_Iunlockhook_ptr(aTHX))
 #undef  PL_unsafe
 #define PL_unsafe              (*Perl_Iunsafe_ptr(aTHX))
+#undef  PL_utf8_X_L
+#define PL_utf8_X_L            (*Perl_Iutf8_X_L_ptr(aTHX))
+#undef  PL_utf8_X_LV
+#define PL_utf8_X_LV           (*Perl_Iutf8_X_LV_ptr(aTHX))
+#undef  PL_utf8_X_LVT
+#define PL_utf8_X_LVT          (*Perl_Iutf8_X_LVT_ptr(aTHX))
+#undef  PL_utf8_X_LV_LVT_V
+#define PL_utf8_X_LV_LVT_V     (*Perl_Iutf8_X_LV_LVT_V_ptr(aTHX))
+#undef  PL_utf8_X_T
+#define PL_utf8_X_T            (*Perl_Iutf8_X_T_ptr(aTHX))
+#undef  PL_utf8_X_V
+#define PL_utf8_X_V            (*Perl_Iutf8_X_V_ptr(aTHX))
+#undef  PL_utf8_X_begin
+#define PL_utf8_X_begin                (*Perl_Iutf8_X_begin_ptr(aTHX))
+#undef  PL_utf8_X_extend
+#define PL_utf8_X_extend       (*Perl_Iutf8_X_extend_ptr(aTHX))
+#undef  PL_utf8_X_non_hangul
+#define PL_utf8_X_non_hangul   (*Perl_Iutf8_X_non_hangul_ptr(aTHX))
+#undef  PL_utf8_X_prepend
+#define PL_utf8_X_prepend      (*Perl_Iutf8_X_prepend_ptr(aTHX))
 #undef  PL_utf8_alnum
 #define PL_utf8_alnum          (*Perl_Iutf8_alnum_ptr(aTHX))
 #undef  PL_utf8_alpha
diff --git a/pod/perl5113delta.pod b/pod/perl5113delta.pod

index 2e1ddf8..ec2443c 100644 (file)
--- a/pod/perl5113delta.pod
+++ b/pod/perl5113delta.pod
@@ -38,6 +38,12 @@ Perl is shipped with the latest Unicode version, 5.2, October 2009.  See
 L<http://www.unicode.org/versions/Unicode5.2.0> for details about this release
 of Unicode.
 
+But, an installation can now fairly easily change Perl to operate on any
+Unicode release.  Perl is shipped with the latest official release, but
+an installation can download and install any prior release from Unicode, and
+cause Perl to work with that (or even multiple releases).  Instructions are in
+L<perlunicode>.
+
 =head2 Unicode properties
 
 Perl can now handle every Unicode character property.  A new pod,
@@ -58,6 +64,15 @@ underscores between digits of numbers.
 All the Unicode-defined synonyms for properties and property values are
 now accepted.
 
+C<qr/\X/>, which matches a Unicode logical character, has been expanded to work
+better with various Asian languages.  It now is defined as an C<extended
+grapheme cluster>.  (See L<http://www.unicode.org/reports/tr29/>).  One change
+due to this is that C<\X> will match the whole sequence C<S<CR LF>>.  Another
+change is that C<\X> will match an isolated mark.  Marks generally come after a
+base character, but it is possible in Unicode to have them in isolation, and
+C<\X> will now handle that case.  Otherwise, this change should be transparent
+for the non-affected languages.
+
 C<\p{...}> matches using the Canonical_Combining_Class property were
 completely broken in previous Perls.  This is now fixed.
 
@@ -120,11 +135,6 @@ Other_Default_Ignorable_Code_Point, Other_Grapheme_Extend,
 Other_ID_Continue, Other_ID_Start, Other_Lowercase, Other_Math, and
 Other_Uppercase.
 
-An installation can now fairly easily change Perl to operate on any
-Unicode release.  Perl is shipped with the latest official release, but
-an installation can now download any prior release, and Perl will work
-with that.  Instructions are in L<perlunicode>.
-
 An installation can now fairly easily change which Unicode properties
 Perl understands.  As mentioned above, certain properties are by default
 turned off.  These include all the Unihan properties (which should be
diff --git a/proto.h b/proto.h

index eab3e82..02fdd2d 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -1414,6 +1414,66 @@ PERL_CALLCONV bool       Perl_is_utf8_mark(pTHX_ const U8 *p)
 #define PERL_ARGS_ASSERT_IS_UTF8_MARK  \
        assert(p)
 
+PERL_CALLCONV bool     Perl_is_utf8_X_begin(pTHX_ const U8 *p)
+                       __attribute__warn_unused_result__
+                       __attribute__nonnull__(pTHX_1);
+#define PERL_ARGS_ASSERT_IS_UTF8_X_BEGIN       \
+       assert(p)
+
+PERL_CALLCONV bool     Perl_is_utf8_X_extend(pTHX_ const U8 *p)
+                       __attribute__warn_unused_result__
+                       __attribute__nonnull__(pTHX_1);
+#define PERL_ARGS_ASSERT_IS_UTF8_X_EXTEND      \
+       assert(p)
+
+PERL_CALLCONV bool     Perl_is_utf8_X_prepend(pTHX_ const U8 *p)
+                       __attribute__warn_unused_result__
+                       __attribute__nonnull__(pTHX_1);
+#define PERL_ARGS_ASSERT_IS_UTF8_X_PREPEND     \
+       assert(p)
+
+PERL_CALLCONV bool     Perl_is_utf8_X_non_hangul(pTHX_ const U8 *p)
+                       __attribute__warn_unused_result__
+                       __attribute__nonnull__(pTHX_1);
+#define PERL_ARGS_ASSERT_IS_UTF8_X_NON_HANGUL  \
+       assert(p)
+
+PERL_CALLCONV bool     Perl_is_utf8_X_L(pTHX_ const U8 *p)
+                       __attribute__warn_unused_result__
+                       __attribute__nonnull__(pTHX_1);
+#define PERL_ARGS_ASSERT_IS_UTF8_X_L   \
+       assert(p)
+
+PERL_CALLCONV bool     Perl_is_utf8_X_LV(pTHX_ const U8 *p)
+                       __attribute__warn_unused_result__
+                       __attribute__nonnull__(pTHX_1);
+#define PERL_ARGS_ASSERT_IS_UTF8_X_LV  \
+       assert(p)
+
+PERL_CALLCONV bool     Perl_is_utf8_X_LVT(pTHX_ const U8 *p)
+                       __attribute__warn_unused_result__
+                       __attribute__nonnull__(pTHX_1);
+#define PERL_ARGS_ASSERT_IS_UTF8_X_LVT \
+       assert(p)
+
+PERL_CALLCONV bool     Perl_is_utf8_X_LV_LVT_V(pTHX_ const U8 *p)
+                       __attribute__warn_unused_result__
+                       __attribute__nonnull__(pTHX_1);
+#define PERL_ARGS_ASSERT_IS_UTF8_X_LV_LVT_V    \
+       assert(p)
+
+PERL_CALLCONV bool     Perl_is_utf8_X_T(pTHX_ const U8 *p)
+                       __attribute__warn_unused_result__
+                       __attribute__nonnull__(pTHX_1);
+#define PERL_ARGS_ASSERT_IS_UTF8_X_T   \
+       assert(p)
+
+PERL_CALLCONV bool     Perl_is_utf8_X_V(pTHX_ const U8 *p)
+                       __attribute__warn_unused_result__
+                       __attribute__nonnull__(pTHX_1);
+#define PERL_ARGS_ASSERT_IS_UTF8_X_V   \
+       assert(p)
+
 PERL_CALLCONV OP*      Perl_jmaybe(pTHX_ OP *o)
                        __attribute__nonnull__(pTHX_1);
 #define PERL_ARGS_ASSERT_JMAYBE        \
diff --git a/regexec.c b/regexec.c

index 06fe13a..b01a99b 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -120,11 +120,31 @@
 /* these are unrolled below in the CCC_TRY_XXX defined */
 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
     if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
+
+/* Like LOAD_UTF8_CHARCLASS, but doesn't assert that the lookup result is true */
+#define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
+    if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)" "); LEAVE; } } STMT_END
+
 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
-#define LOAD_UTF8_CHARCLASS_MARK()  LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
 
+#define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */        \
+       LOAD_UTF8_CHARCLASS(X_begin, " "),                                  \
+       LOAD_UTF8_CHARCLASS(X_non_hangul, "A"),                             \
+       /* These are utf8 constants, and not utf-ebcdic constants, so the   \
+           * assert should likely and hopefully fail on an EBCDIC machine */ \
+       LOAD_UTF8_CHARCLASS(X_extend, "\xcc\x80"), /* U+0300 */             \
+                                                                           \
+       /* No asserts are done for these, in case called on an early        \
+           * Unicode version in which they map to nothing */               \
+       LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend),/* U+0E40 "\xe0\xb9\x80" */ \
+       LOAD_UTF8_CHARCLASS_NO_CHECK(X_L),          /* U+1100 "\xe1\x84\x80" */ \
+       LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV),     /* U+AC00 "\xea\xb0\x80" */ \
+       LOAD_UTF8_CHARCLASS_NO_CHECK(X_LVT),    /* U+AC01 "\xea\xb0\x81" */ \
+       LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V),/* U+AC01 "\xea\xb0\x81" */\
+       LOAD_UTF8_CHARCLASS_NO_CHECK(X_T),      /* U+11A8 "\xe1\x86\xa8" */ \
+       LOAD_UTF8_CHARCLASS_NO_CHECK(X_V)       /* U+1160 "\xe1\x85\xa0" */  
 
 /* 
    We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
@@ -3521,22 +3541,216 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
        CCC_TRY_AFF( DIGIT,  DIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC);
        CCC_TRY_NEG(NDIGIT, NDIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC);
 
-       case CLUMP:
+       case CLUMP: /* Match \X: logical Unicode character.  This is defined as
+                      a Unicode extended Grapheme Cluster */
+           /* From http://www.unicode.org/reports/tr29 (5.2 version).  An
+             extended Grapheme Cluster is:
+
+              CR LF
+              | Prepend* Begin Extend*
+              | .
+
+              Begin is (Hangul-syllable | ! Control)
+              Extend is (Grapheme_Extend | Spacing_Mark)
+              Control is [ GCB_Control CR LF ]
+
+              The discussion below shows how the code for CLUMP is derived
+              from this regex.  Note that most of these concepts are from
+              property values of the Grapheme Cluster Boundary (GCB) property.
+              No code point can have multiple property values for a given
+              property.  Thus a code point in Prepend can't be in Control, but
+              it must be in !Control.  This is why Control above includes
+              GCB_Control plus CR plus LF.  The latter two are used in the GCB
+              property separately, and so can't be in GCB_Control, even though
+              they logically are controls.  Control is not the same as gc=cc,
+              but includes format and other characters as well.
+
+              The Unicode definition of Hangul-syllable is:
+                ( L+
+                  | (L* ( ( V | LV ) V* | LVT ) T*)
+                  | T+
+                )
+              Each of these is a value for the GCB property, and hence must be
+              disjoint, so the order they are tested is immaterial, so the
+              above can safely be changed to
+                  T+
+                  | L+
+                  | (L* ( LVT | ( V | LV ) V*) T*)
+
+              The last two terms can be combined like this:
+                  L* ( L
+                       | (( LVT | ( V | LV ) V*) T*))
+
+              And refactored into this:
+                  L* (L | LVT T* | V  V* T* | LV  V* T*)
+
+              That means that if we have seen any L's at all we can quit
+              there, but if the next character is an LVT, a V or an LV we
+              should keep going.
+
+              There is a subtlety with Prepend* which showed up in testing.
+              Note that the Begin, and only the Begin is required in:
+               | Prepend* Begin Extend*
+              Also, Begin contains '! Control'.  A Prepend must be a '!
+              Control', which means it must be a Begin.  What it comes down to
+              is that if we match Prepend* and then find no suitable Begin
+              afterwards, we can backtrack over the last Prepend, and that one
+              will be a suitable Begin.
+           */
+
            if (locinput >= PL_regeol)
                sayNO;
-           if  (do_utf8) {
-               LOAD_UTF8_CHARCLASS_MARK();
-               if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
-                   sayNO;
-               locinput += PL_utf8skip[nextchr];
-               while (locinput < PL_regeol &&
-                      swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
-                   locinput += UTF8SKIP(locinput);
-               if (locinput > PL_regeol)
-                   sayNO;
-           } 
-           else
-              locinput++;
+           if  (! do_utf8) {
+
+               /* Match either CR LF  or '.', as all the other possibilities
+                * require utf8 */
+               locinput++;         /* Match the . or CR */
+               if (nextchr == '\r'
+                   && locinput < PL_regeol
+                   && UCHARAT(locinput) == '\n') locinput++;
+           }
+           else {
+
+               /* Utf8: See if is ( CR LF ); already know that locinput <
+                * PL_regeol, so locinput+1 is in bounds */
+               if (nextchr == '\r' && UCHARAT(locinput + 1) == '\n') {
+                   locinput += 2;
+               }
+               else {
+                   /* In case have to backtrack to beginning, then match '.' */
+                   char *starting = locinput;
+
+                   /* In case have to backtrack the last prepend */
+                   char *previous_prepend = 0;
+
+                   LOAD_UTF8_CHARCLASS_GCB();
+
+                   /* Match (prepend)* */
+                   while (locinput < PL_regeol
+                          && swash_fetch(PL_utf8_X_prepend,
+                                         (U8*)locinput, do_utf8))
+                   {
+                       previous_prepend = locinput;
+                       locinput += UTF8SKIP(locinput);
+                   }
+
+                   /* As noted above, if we matched a prepend character, but
+                    * the next thing won't match, back off the last prepend we
+                    * matched, as it is guaranteed to match the begin */
+                   if (previous_prepend
+                       && (locinput >=  PL_regeol
+                           || ! swash_fetch(PL_utf8_X_begin,
+                                            (U8*)locinput, do_utf8)))
+                   {
+                       locinput = previous_prepend;
+                   }
+
+                   /* Note that here we know PL_regeol > locinput, as we
+                    * tested that upon input to this switch case, and if we
+                    * moved locinput forward, we tested the result just above
+                    * and it either passed, or we backed off so that it will
+                    * now pass */
+                   if (! swash_fetch(PL_utf8_X_begin, (U8*)locinput, do_utf8)) {
+
+                       /* Here did not match the required 'Begin' in the
+                        * second term.  So just match the very first
+                        * character, the '.' of the final term of the regex */
+                       locinput = starting + UTF8SKIP(starting);
+                   } else {
+
+                       /* Here is the beginning of a character that can have
+                        * an extender.  It is either a hangul syllable, or a
+                        * non-control */
+                       if (swash_fetch(PL_utf8_X_non_hangul,
+                                       (U8*)locinput, do_utf8))
+                       {
+
+                           /* Here not a Hangul syllable, must be a
+                            * ('! Control') */
+                           locinput += UTF8SKIP(locinput);
+                       } else {
+
+                           /* Here is a Hangul syllable.  It can be composed
+                            * of several individual characters.  One
+                            * possibility is T+ */
+                           if (swash_fetch(PL_utf8_X_T,
+                                           (U8*)locinput, do_utf8))
+                           {
+                               while (locinput < PL_regeol
+                                       && swash_fetch(PL_utf8_X_T,
+                                                       (U8*)locinput, do_utf8))
+                               {
+                                   locinput += UTF8SKIP(locinput);
+                               }
+                           } else {
+
+                               /* Here, not T+, but is a Hangul.  That means
+                                * it is one of the others: L, LV, LVT or V,
+                                * and matches:
+                                * L* (L | LVT T* | V  V* T* | LV  V* T*) */
+
+                               /* Match L*           */
+                               while (locinput < PL_regeol
+                                       && swash_fetch(PL_utf8_X_L,
+                                                       (U8*)locinput, do_utf8))
+                               {
+                                   locinput += UTF8SKIP(locinput);
+                               }
+
+                               /* Here, have exhausted L*.  If the next
+                                * character is not an LV, LVT nor V, it means
+                                * we had to have at least one L, which matches
+                                * L+ in the original equation, so we have a
+                                * complete hangul syllable, and are done. */
+
+                               if (locinput < PL_regeol
+                                   && swash_fetch(PL_utf8_X_LV_LVT_V,
+                                                   (U8*)locinput, do_utf8))
+                               {
+
+                                   /* Otherwise keep going.  Must be LV, LVT
+                                    * or V.  See if LVT */
+                                   if (swash_fetch(PL_utf8_X_LVT,
+                                                   (U8*)locinput, do_utf8))
+                                   {
+                                       locinput += UTF8SKIP(locinput);
+                                   } else {
+
+                                       /* Must be  V or LV.  Take it, then
+                                        * match V*     */
+                                       locinput += UTF8SKIP(locinput);
+                                       while (locinput < PL_regeol
+                                               && swash_fetch(PL_utf8_X_V,
+                                                        (U8*)locinput, do_utf8))
+                                       {
+                                           locinput += UTF8SKIP(locinput);
+                                       }
+                                   }
+
+                                   /* And any of LV, LVT, or V can be followed
+                                    * by T*            */
+                                   while (locinput < PL_regeol
+                                          && swash_fetch(PL_utf8_X_T,
+                                                          (U8*)locinput,
+                                                          do_utf8))
+                                   {
+                                       locinput += UTF8SKIP(locinput);
+                                   }
+                               }
+                           }
+                       }
+
+                       /* Match any extender */
+                       while (locinput < PL_regeol
+                               && swash_fetch(PL_utf8_X_extend,
+                                               (U8*)locinput, do_utf8))
+                       {
+                           locinput += UTF8SKIP(locinput);
+                       }
+                   }
+               }
+               if (locinput > PL_regeol) sayNO;
+           }
            nextchr = UCHARAT(locinput);
            break;
             
diff --git a/sv.c b/sv.c

index a2c383c..b5cb17f 100644 (file)
--- a/sv.c
+++ b/sv.c
@@ -12214,6 +12214,16 @@ perl_clone_using(PerlInterpreter *proto_perl, UV flags,
     PL_utf8_punct      = sv_dup_inc(proto_perl->Iutf8_punct, param);
     PL_utf8_xdigit     = sv_dup_inc(proto_perl->Iutf8_xdigit, param);
     PL_utf8_mark       = sv_dup_inc(proto_perl->Iutf8_mark, param);
+    PL_utf8_X_begin    = sv_dup_inc(proto_perl->Iutf8_X_begin, param);
+    PL_utf8_X_extend   = sv_dup_inc(proto_perl->Iutf8_X_extend, param);
+    PL_utf8_X_prepend  = sv_dup_inc(proto_perl->Iutf8_X_prepend, param);
+    PL_utf8_X_non_hangul       = sv_dup_inc(proto_perl->Iutf8_X_non_hangul, param);
+    PL_utf8_X_L        = sv_dup_inc(proto_perl->Iutf8_X_L, param);
+    PL_utf8_X_LV       = sv_dup_inc(proto_perl->Iutf8_X_LV, param);
+    PL_utf8_X_LVT      = sv_dup_inc(proto_perl->Iutf8_X_LVT, param);
+    PL_utf8_X_T        = sv_dup_inc(proto_perl->Iutf8_X_T, param);
+    PL_utf8_X_V        = sv_dup_inc(proto_perl->Iutf8_X_V, param);
+    PL_utf8_X_LV_LVT_V = sv_dup_inc(proto_perl->Iutf8_X_LV_LVT_V, param);
     PL_utf8_toupper    = sv_dup_inc(proto_perl->Iutf8_toupper, param);
     PL_utf8_totitle    = sv_dup_inc(proto_perl->Iutf8_totitle, param);
     PL_utf8_tolower    = sv_dup_inc(proto_perl->Iutf8_tolower, param);
diff --git a/utf8.c b/utf8.c

index c504891..5f3c990 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -1488,6 +1488,106 @@ Perl_is_utf8_mark(pTHX_ const U8 *p)
     return is_utf8_common(p, &PL_utf8_mark, "IsM");
 }
 
+bool
+Perl_is_utf8_X_begin(pTHX_ const U8 *p)
+{
+    dVAR;
+    /* Returns is_utf8_common()'s result for p and property "_X_Begin". */
+    PERL_ARGS_ASSERT_IS_UTF8_X_BEGIN;
+    /* "_X_Begin": leading '_' suggests a Perl-internal property -- confirm. */
+    return is_utf8_common(p, &PL_utf8_X_begin, "_X_Begin");
+}
+
+bool
+Perl_is_utf8_X_extend(pTHX_ const U8 *p)
+{
+    dVAR;
+    /* Returns is_utf8_common()'s result for p and property "_X_Extend". */
+    PERL_ARGS_ASSERT_IS_UTF8_X_EXTEND;
+    /* regexec.c also reads PL_utf8_X_extend directly via swash_fetch(). */
+    return is_utf8_common(p, &PL_utf8_X_extend, "_X_Extend");
+}
+
+bool
+Perl_is_utf8_X_prepend(pTHX_ const U8 *p)
+{
+    dVAR;
+    /* Returns is_utf8_common()'s result for p and property "GCB=Prepend". */
+    PERL_ARGS_ASSERT_IS_UTF8_X_PREPEND;
+    /* GCB is Unicode's short alias for Grapheme_Cluster_Break. */
+    return is_utf8_common(p, &PL_utf8_X_prepend, "GCB=Prepend");
+}
+
+bool
+Perl_is_utf8_X_non_hangul(pTHX_ const U8 *p)
+{
+    dVAR;
+    /* Returns is_utf8_common()'s result for p and "HST=Not_Applicable". */
+    PERL_ARGS_ASSERT_IS_UTF8_X_NON_HANGUL;
+    /* HST is Unicode's short alias for Hangul_Syllable_Type. */
+    return is_utf8_common(p, &PL_utf8_X_non_hangul, "HST=Not_Applicable");
+}
+
+bool
+Perl_is_utf8_X_L(pTHX_ const U8 *p)
+{
+    dVAR;
+    /* Returns is_utf8_common()'s result for p and property "GCB=L". */
+    PERL_ARGS_ASSERT_IS_UTF8_X_L;
+    /* GCB=L: Grapheme_Cluster_Break value for Hangul leading jamo. */
+    return is_utf8_common(p, &PL_utf8_X_L, "GCB=L");
+}
+
+bool
+Perl_is_utf8_X_LV(pTHX_ const U8 *p)
+{
+    dVAR;
+    /* Returns is_utf8_common()'s result for p and property "GCB=LV". */
+    PERL_ARGS_ASSERT_IS_UTF8_X_LV;
+    /* GCB=LV: Grapheme_Cluster_Break value for Hangul LV syllables. */
+    return is_utf8_common(p, &PL_utf8_X_LV, "GCB=LV");
+}
+
+bool
+Perl_is_utf8_X_LVT(pTHX_ const U8 *p)
+{
+    dVAR;
+    /* Returns is_utf8_common()'s result for p and property "GCB=LVT". */
+    PERL_ARGS_ASSERT_IS_UTF8_X_LVT;
+    /* GCB=LVT: Grapheme_Cluster_Break value for Hangul LVT syllables. */
+    return is_utf8_common(p, &PL_utf8_X_LVT, "GCB=LVT");
+}
+
+bool
+Perl_is_utf8_X_T(pTHX_ const U8 *p)
+{
+    dVAR;
+    /* Returns is_utf8_common()'s result for p and property "GCB=T". */
+    PERL_ARGS_ASSERT_IS_UTF8_X_T;
+    /* GCB=T: Grapheme_Cluster_Break value for Hangul trailing jamo. */
+    return is_utf8_common(p, &PL_utf8_X_T, "GCB=T");
+}
+
+bool
+Perl_is_utf8_X_V(pTHX_ const U8 *p)
+{
+    dVAR;
+    /* Returns is_utf8_common()'s result for p and property "GCB=V". */
+    PERL_ARGS_ASSERT_IS_UTF8_X_V;
+    /* GCB=V: Grapheme_Cluster_Break value for Hangul vowel jamo. */
+    return is_utf8_common(p, &PL_utf8_X_V, "GCB=V");
+}
+
+bool
+Perl_is_utf8_X_LV_LVT_V(pTHX_ const U8 *p)
+{
+    dVAR;
+    /* Returns is_utf8_common()'s result for p and property "_X_LV_LVT_V". */
+    PERL_ARGS_ASSERT_IS_UTF8_X_LV_LVT_V;
+    /* "_X_LV_LVT_V": presumably the union of GCB LV, LVT and V -- confirm. */
+    return is_utf8_common(p, &PL_utf8_X_LV_LVT_V, "_X_LV_LVT_V");
+}
+
 /*
 =for apidoc to_utf8_case
 
@@ -1532,6 +1632,22 @@ Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
 
     if (!*swashp) /* load on-demand */
          *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
+    /* This is the beginnings of a skeleton of code to read the info section
+     * that is in all the swashes, in case we ever want to do that, so one can
+     * read things whose maps aren't code points, and whose default if missing
+     * is not the code point itself.  This was just to see if it actually
+     * worked.  The possibilities are detailed in perluniprops.pod.  Disabled:
+       HV * const hv = get_hv("utf8::SwashInfo", 0);
+       if (hv) {
+        SV **svp;
+        svp = hv_fetch(hv, (const char*)normal, strlen(normal), FALSE);
+            const char *s;
+
+             HV * const this_hash = SvRV(*svp);
+               svp = hv_fetch(this_hash, "type", strlen("type"), FALSE);
+             s = SvPV_const(*svp, len);
+       }
+    }*/
 
     /* The 0xDF is the only special casing Unicode code point below 0x100. */
     if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
@@ -1594,7 +1710,8 @@ Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
         }
     }
 
-    if (!len) /* Neither: just copy. */
+    if (!len) /* Neither: just copy.  In other words, there was no mapping
+                defined, which means that the code point maps to itself */
         len = uvchr_to_utf8(ustrp, uv0) - ustrp;
 
     if (lenp)
@@ -1809,7 +1926,7 @@ Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
        ptr = tmputf8;
     }
     /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
-     * then the "swatch" is a vec() for al the chars which start
+     * then the "swatch" is a vec() for all the chars which start
      * with 0xAA..0xYY
      * So the key in the hash (klen) is length of encoded char -1
      */
@@ -1817,7 +1934,7 @@ Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
     off  = ptr[klen];
 
     if (klen == 0) {
-      /* If char in invariant then swatch is for all the invariant chars
+      /* If char is invariant then swatch is for all the invariant chars
        * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
        */
        needents = UTF_CONTINUATION_MARK;
diff --git a/utf8.h b/utf8.h

index 19f2174..8fef274 100644 (file)
--- a/utf8.h
+++ b/utf8.h
@@ -73,21 +73,20 @@ END_EXTERN_C
 
    U+0000..U+007F      00..7F
    U+0080..U+07FF      C2..DF    80..BF
-   U+0800..U+0FFF      E0        A0..BF    80..BF
+   U+0800..U+0FFF      E0      * A0..BF    80..BF
    U+1000..U+CFFF       E1..EC    80..BF    80..BF
-   U+D000..U+D7FF       ED        80..9F    80..BF
-   U+D800..U+DFFF       ******* ill-formed *******
+   U+D000..U+D7FF       ED      * 80..9F    80..BF
+   U+D800..U+DFFF       +++++++ utf16 surrogates, not legal utf8 +++++++
    U+E000..U+FFFF       EE..EF    80..BF    80..BF
-  U+10000..U+3FFFF     F0        90..BF    80..BF    80..BF
+  U+10000..U+3FFFF     F0      * 90..BF    80..BF    80..BF
   U+40000..U+FFFFF     F1..F3    80..BF    80..BF    80..BF
  U+100000..U+10FFFF    F4        80..8F    80..BF    80..BF
 
-Note the A0..BF in U+0800..U+0FFF, the 80..9F in U+D000...U+D7FF,
-the 90..BF in U+10000..U+3FFFF, and the 80...8F in U+100000..U+10FFFF.
-The "gaps" are caused by legal UTF-8 avoiding non-shortest encodings:
-it is technically possible to UTF-8-encode a single code point in different
-ways, but that is explicitly forbidden, and the shortest possible encoding
-should always be used (and that is what Perl does).
+Note the gaps before the 2nd Byte entries above marked by '*'.  These are
+caused by legal UTF-8 avoiding non-shortest encodings: it is technically
+possible to UTF-8-encode a single code point in different ways, but that is
+explicitly forbidden, and the shortest possible encoding should always be used
+(and that is what Perl does).
 
  */
 
@@ -213,11 +212,12 @@ encoded character.
 #define UTF8_ALLOW_EMPTY               0x0001
 #define UTF8_ALLOW_CONTINUATION                0x0002
 #define UTF8_ALLOW_NON_CONTINUATION    0x0004
-#define UTF8_ALLOW_FE_FF               0x0008 /* Allow above 0x7fffFFFF */
-#define UTF8_ALLOW_SHORT               0x0010
+#define UTF8_ALLOW_FE_FF               0x0008 /* Allow FE or FF start bytes, \
+                                                 yields above 0x7fffFFFF */
+#define UTF8_ALLOW_SHORT               0x0010 /* expecting more bytes */
 #define UTF8_ALLOW_SURROGATE           0x0020
 #define UTF8_ALLOW_FFFF                        0x0040 /* Allow UNICODE_ILLEGAL */
-#define UTF8_ALLOW_LONG                        0x0080
+#define UTF8_ALLOW_LONG                        0x0080 /* expecting fewer bytes */
 #define UTF8_ALLOW_ANYUV               (UTF8_ALLOW_EMPTY|UTF8_ALLOW_FE_FF|\
                                         UTF8_ALLOW_SURROGATE|UTF8_ALLOW_FFFF)
 #define UTF8_ALLOW_ANY                 0x00FF
MANIFEST		patch \| blob \| blame \| history
embed.fnc		patch \| blob \| blame \| history
embed.h		patch \| blob \| blame \| history
embedvar.h		patch \| blob \| blame \| history
intrpvar.h		patch \| blob \| blame \| history
lib/unicore/README.perl		patch \| blob \| blame \| history
lib/unicore/auxiliary/GCBTest.txt	[new file with mode: 0644]	patch \| blob
lib/unicore/mktables		patch \| blob \| blame \| history
perlapi.h		patch \| blob \| blame \| history
pod/perl5113delta.pod		patch \| blob \| blame \| history
proto.h		patch \| blob \| blame \| history
regexec.c		patch \| blob \| blame \| history
sv.c		patch \| blob \| blame \| history
utf8.c		patch \| blob \| blame \| history
utf8.h		patch \| blob \| blame \| history