From: Jarkko Hietaniemi Date: Tue, 6 Jul 1999 21:47:04 +0000 (+0000) Subject: POSIX [[:character class:]] support for standard, locale, X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=b8c5462f6edbb2dd616e1733df011beee816eee1;p=p5sagit%2Fp5-mst-13.2.git POSIX [[:character class:]] support for standard, locale, and utf8. If both utf8 and locale are on, utf8 wins. I don't fully understand why so many tables changed in lib/unicode because of "make" -- maybe it was just overdue. p4raw-id: //depot/cfgperl@3624 --- diff --git a/MANIFEST b/MANIFEST index 11543e1..6aa3d5f 100644 --- a/MANIFEST +++ b/MANIFEST @@ -826,6 +826,7 @@ lib/unicode/Is/So.pl Unicode character database lib/unicode/Is/Space.pl Unicode character database lib/unicode/Is/Syllable.pl Unicode character database lib/unicode/Is/Upper.pl Unicode character database +lib/unicode/Is/Word.pl Unicode character database lib/unicode/Is/Z.pl Unicode character database lib/unicode/Is/Zl.pl Unicode character database lib/unicode/Is/Zp.pl Unicode character database diff --git a/Todo-5.005 b/Todo-5.005 index a8831b1..b700603 100644 --- a/Todo-5.005 +++ b/Todo-5.005 @@ -36,8 +36,8 @@ Locales decimal separator (3,1415927 is Europeanese for an approximation of pi) Regexen - POSIX [:foo:] character classes - ([=bar=] and [.zap.] would nice too but there's no API for them) + POSIX [=bar=] and [.zap.] 
would nice too but there's no API for them + (=bar= could be done with Unicode, though) approximate matching Reliable Signals diff --git a/embed.h b/embed.h index 0871c6f..ba07096 100644 --- a/embed.h +++ b/embed.h @@ -203,35 +203,53 @@ #define io_close Perl_io_close #define invert Perl_invert #define is_uni_alnum Perl_is_uni_alnum +#define is_uni_alnumc Perl_is_uni_alnumc #define is_uni_idfirst Perl_is_uni_idfirst #define is_uni_alpha Perl_is_uni_alpha +#define is_uni_ascii Perl_is_uni_ascii #define is_uni_space Perl_is_uni_space +#define is_uni_cntrl Perl_is_uni_cntrl +#define is_uni_graph Perl_is_uni_graph #define is_uni_digit Perl_is_uni_digit #define is_uni_upper Perl_is_uni_upper #define is_uni_lower Perl_is_uni_lower #define is_uni_print Perl_is_uni_print +#define is_uni_punct Perl_is_uni_punct +#define is_uni_xdigit Perl_is_uni_xdigit #define to_uni_upper Perl_to_uni_upper #define to_uni_title Perl_to_uni_title #define to_uni_lower Perl_to_uni_lower #define is_uni_alnum_lc Perl_is_uni_alnum_lc +#define is_uni_alnumc_lc Perl_is_uni_alnumc_lc #define is_uni_idfirst_lc Perl_is_uni_idfirst_lc #define is_uni_alpha_lc Perl_is_uni_alpha_lc +#define is_uni_ascii_lc Perl_is_uni_ascii_lc #define is_uni_space_lc Perl_is_uni_space_lc +#define is_uni_cntrl_lc Perl_is_uni_cntrl_lc +#define is_uni_graph_lc Perl_is_uni_graph_lc #define is_uni_digit_lc Perl_is_uni_digit_lc #define is_uni_upper_lc Perl_is_uni_upper_lc #define is_uni_lower_lc Perl_is_uni_lower_lc #define is_uni_print_lc Perl_is_uni_print_lc +#define is_uni_punct_lc Perl_is_uni_punct_lc +#define is_uni_xdigit_lc Perl_is_uni_xdigit_lc #define to_uni_upper_lc Perl_to_uni_upper_lc #define to_uni_title_lc Perl_to_uni_title_lc #define to_uni_lower_lc Perl_to_uni_lower_lc #define is_utf8_alnum Perl_is_utf8_alnum +#define is_utf8_alnumc Perl_is_utf8_alnumc #define is_utf8_idfirst Perl_is_utf8_idfirst #define is_utf8_alpha Perl_is_utf8_alpha +#define is_utf8_ascii Perl_is_utf8_ascii #define is_utf8_space 
Perl_is_utf8_space +#define is_utf8_cntrl Perl_is_utf8_cntrl #define is_utf8_digit Perl_is_utf8_digit +#define is_utf8_graph Perl_is_utf8_graph #define is_utf8_upper Perl_is_utf8_upper #define is_utf8_lower Perl_is_utf8_lower #define is_utf8_print Perl_is_utf8_print +#define is_utf8_punct Perl_is_utf8_punct +#define is_utf8_xdigit Perl_is_utf8_xdigit #define is_utf8_mark Perl_is_utf8_mark #define jmaybe Perl_jmaybe #define keyword Perl_keyword @@ -846,6 +864,7 @@ #define add_data S_add_data #define re_croak2 S_re_croak2 #define regpposixcc S_regpposixcc +#define checkposixcc S_checkposixcc #define clear_re S_clear_re #endif #if defined(PERL_IN_REGEXEC_C) || defined(PERL_DECL_PROT) @@ -986,6 +1005,7 @@ #define ck_require Perl_ck_require #define ck_rfun Perl_ck_rfun #define ck_rvconst Perl_ck_rvconst +#define ck_sassign Perl_ck_sassign #define ck_scmp Perl_ck_scmp #define ck_select Perl_ck_select #define ck_shift Perl_ck_shift @@ -1520,35 +1540,53 @@ #define io_close(a) Perl_io_close(aTHX_ a) #define invert(a) Perl_invert(aTHX_ a) #define is_uni_alnum(a) Perl_is_uni_alnum(aTHX_ a) +#define is_uni_alnumc(a) Perl_is_uni_alnumc(aTHX_ a) #define is_uni_idfirst(a) Perl_is_uni_idfirst(aTHX_ a) #define is_uni_alpha(a) Perl_is_uni_alpha(aTHX_ a) +#define is_uni_ascii(a) Perl_is_uni_ascii(aTHX_ a) #define is_uni_space(a) Perl_is_uni_space(aTHX_ a) +#define is_uni_cntrl(a) Perl_is_uni_cntrl(aTHX_ a) +#define is_uni_graph(a) Perl_is_uni_graph(aTHX_ a) #define is_uni_digit(a) Perl_is_uni_digit(aTHX_ a) #define is_uni_upper(a) Perl_is_uni_upper(aTHX_ a) #define is_uni_lower(a) Perl_is_uni_lower(aTHX_ a) #define is_uni_print(a) Perl_is_uni_print(aTHX_ a) +#define is_uni_punct(a) Perl_is_uni_punct(aTHX_ a) +#define is_uni_xdigit(a) Perl_is_uni_xdigit(aTHX_ a) #define to_uni_upper(a) Perl_to_uni_upper(aTHX_ a) #define to_uni_title(a) Perl_to_uni_title(aTHX_ a) #define to_uni_lower(a) Perl_to_uni_lower(aTHX_ a) #define is_uni_alnum_lc(a) Perl_is_uni_alnum_lc(aTHX_ a) +#define 
is_uni_alnumc_lc(a) Perl_is_uni_alnumc_lc(aTHX_ a) #define is_uni_idfirst_lc(a) Perl_is_uni_idfirst_lc(aTHX_ a) #define is_uni_alpha_lc(a) Perl_is_uni_alpha_lc(aTHX_ a) +#define is_uni_ascii_lc(a) Perl_is_uni_ascii_lc(aTHX_ a) #define is_uni_space_lc(a) Perl_is_uni_space_lc(aTHX_ a) +#define is_uni_cntrl_lc(a) Perl_is_uni_cntrl_lc(aTHX_ a) +#define is_uni_graph_lc(a) Perl_is_uni_graph_lc(aTHX_ a) #define is_uni_digit_lc(a) Perl_is_uni_digit_lc(aTHX_ a) #define is_uni_upper_lc(a) Perl_is_uni_upper_lc(aTHX_ a) #define is_uni_lower_lc(a) Perl_is_uni_lower_lc(aTHX_ a) #define is_uni_print_lc(a) Perl_is_uni_print_lc(aTHX_ a) +#define is_uni_punct_lc(a) Perl_is_uni_punct_lc(aTHX_ a) +#define is_uni_xdigit_lc(a) Perl_is_uni_xdigit_lc(aTHX_ a) #define to_uni_upper_lc(a) Perl_to_uni_upper_lc(aTHX_ a) #define to_uni_title_lc(a) Perl_to_uni_title_lc(aTHX_ a) #define to_uni_lower_lc(a) Perl_to_uni_lower_lc(aTHX_ a) #define is_utf8_alnum(a) Perl_is_utf8_alnum(aTHX_ a) +#define is_utf8_alnumc(a) Perl_is_utf8_alnumc(aTHX_ a) #define is_utf8_idfirst(a) Perl_is_utf8_idfirst(aTHX_ a) #define is_utf8_alpha(a) Perl_is_utf8_alpha(aTHX_ a) +#define is_utf8_ascii(a) Perl_is_utf8_ascii(aTHX_ a) #define is_utf8_space(a) Perl_is_utf8_space(aTHX_ a) +#define is_utf8_cntrl(a) Perl_is_utf8_cntrl(aTHX_ a) #define is_utf8_digit(a) Perl_is_utf8_digit(aTHX_ a) +#define is_utf8_graph(a) Perl_is_utf8_graph(aTHX_ a) #define is_utf8_upper(a) Perl_is_utf8_upper(aTHX_ a) #define is_utf8_lower(a) Perl_is_utf8_lower(aTHX_ a) #define is_utf8_print(a) Perl_is_utf8_print(aTHX_ a) +#define is_utf8_punct(a) Perl_is_utf8_punct(aTHX_ a) +#define is_utf8_xdigit(a) Perl_is_utf8_xdigit(aTHX_ a) #define is_utf8_mark(a) Perl_is_utf8_mark(aTHX_ a) #define jmaybe(a) Perl_jmaybe(aTHX_ a) #define keyword(a,b) Perl_keyword(aTHX_ a,b) @@ -2153,6 +2191,7 @@ #define study_chunk(a,b,c,d,e) S_study_chunk(aTHX_ a,b,c,d,e) #define add_data(a,b) S_add_data(aTHX_ a,b) #define regpposixcc(a) S_regpposixcc(aTHX_ a) +#define 
checkposixcc() S_checkposixcc(aTHX) #define clear_re(a) S_clear_re(aTHX_ a) #endif #if defined(PERL_IN_REGEXEC_C) || defined(PERL_DECL_PROT) @@ -2293,6 +2332,7 @@ #define ck_require(a) Perl_ck_require(aTHX_ a) #define ck_rfun(a) Perl_ck_rfun(aTHX_ a) #define ck_rvconst(a) Perl_ck_rvconst(aTHX_ a) +#define ck_sassign(a) Perl_ck_sassign(aTHX_ a) #define ck_scmp(a) Perl_ck_scmp(aTHX_ a) #define ck_select(a) Perl_ck_select(aTHX_ a) #define ck_shift(a) Perl_ck_shift(aTHX_ a) @@ -3014,12 +3054,20 @@ #define invert Perl_invert #define Perl_is_uni_alnum CPerlObj::Perl_is_uni_alnum #define is_uni_alnum Perl_is_uni_alnum +#define Perl_is_uni_alnumc CPerlObj::Perl_is_uni_alnumc +#define is_uni_alnumc Perl_is_uni_alnumc #define Perl_is_uni_idfirst CPerlObj::Perl_is_uni_idfirst #define is_uni_idfirst Perl_is_uni_idfirst #define Perl_is_uni_alpha CPerlObj::Perl_is_uni_alpha #define is_uni_alpha Perl_is_uni_alpha +#define Perl_is_uni_ascii CPerlObj::Perl_is_uni_ascii +#define is_uni_ascii Perl_is_uni_ascii #define Perl_is_uni_space CPerlObj::Perl_is_uni_space #define is_uni_space Perl_is_uni_space +#define Perl_is_uni_cntrl CPerlObj::Perl_is_uni_cntrl +#define is_uni_cntrl Perl_is_uni_cntrl +#define Perl_is_uni_graph CPerlObj::Perl_is_uni_graph +#define is_uni_graph Perl_is_uni_graph #define Perl_is_uni_digit CPerlObj::Perl_is_uni_digit #define is_uni_digit Perl_is_uni_digit #define Perl_is_uni_upper CPerlObj::Perl_is_uni_upper @@ -3028,6 +3076,10 @@ #define is_uni_lower Perl_is_uni_lower #define Perl_is_uni_print CPerlObj::Perl_is_uni_print #define is_uni_print Perl_is_uni_print +#define Perl_is_uni_punct CPerlObj::Perl_is_uni_punct +#define is_uni_punct Perl_is_uni_punct +#define Perl_is_uni_xdigit CPerlObj::Perl_is_uni_xdigit +#define is_uni_xdigit Perl_is_uni_xdigit #define Perl_to_uni_upper CPerlObj::Perl_to_uni_upper #define to_uni_upper Perl_to_uni_upper #define Perl_to_uni_title CPerlObj::Perl_to_uni_title @@ -3036,12 +3088,20 @@ #define to_uni_lower Perl_to_uni_lower 
#define Perl_is_uni_alnum_lc CPerlObj::Perl_is_uni_alnum_lc #define is_uni_alnum_lc Perl_is_uni_alnum_lc +#define Perl_is_uni_alnumc_lc CPerlObj::Perl_is_uni_alnumc_lc +#define is_uni_alnumc_lc Perl_is_uni_alnumc_lc #define Perl_is_uni_idfirst_lc CPerlObj::Perl_is_uni_idfirst_lc #define is_uni_idfirst_lc Perl_is_uni_idfirst_lc #define Perl_is_uni_alpha_lc CPerlObj::Perl_is_uni_alpha_lc #define is_uni_alpha_lc Perl_is_uni_alpha_lc +#define Perl_is_uni_ascii_lc CPerlObj::Perl_is_uni_ascii_lc +#define is_uni_ascii_lc Perl_is_uni_ascii_lc #define Perl_is_uni_space_lc CPerlObj::Perl_is_uni_space_lc #define is_uni_space_lc Perl_is_uni_space_lc +#define Perl_is_uni_cntrl_lc CPerlObj::Perl_is_uni_cntrl_lc +#define is_uni_cntrl_lc Perl_is_uni_cntrl_lc +#define Perl_is_uni_graph_lc CPerlObj::Perl_is_uni_graph_lc +#define is_uni_graph_lc Perl_is_uni_graph_lc #define Perl_is_uni_digit_lc CPerlObj::Perl_is_uni_digit_lc #define is_uni_digit_lc Perl_is_uni_digit_lc #define Perl_is_uni_upper_lc CPerlObj::Perl_is_uni_upper_lc @@ -3050,6 +3110,10 @@ #define is_uni_lower_lc Perl_is_uni_lower_lc #define Perl_is_uni_print_lc CPerlObj::Perl_is_uni_print_lc #define is_uni_print_lc Perl_is_uni_print_lc +#define Perl_is_uni_punct_lc CPerlObj::Perl_is_uni_punct_lc +#define is_uni_punct_lc Perl_is_uni_punct_lc +#define Perl_is_uni_xdigit_lc CPerlObj::Perl_is_uni_xdigit_lc +#define is_uni_xdigit_lc Perl_is_uni_xdigit_lc #define Perl_to_uni_upper_lc CPerlObj::Perl_to_uni_upper_lc #define to_uni_upper_lc Perl_to_uni_upper_lc #define Perl_to_uni_title_lc CPerlObj::Perl_to_uni_title_lc @@ -3058,20 +3122,32 @@ #define to_uni_lower_lc Perl_to_uni_lower_lc #define Perl_is_utf8_alnum CPerlObj::Perl_is_utf8_alnum #define is_utf8_alnum Perl_is_utf8_alnum +#define Perl_is_utf8_alnumc CPerlObj::Perl_is_utf8_alnumc +#define is_utf8_alnumc Perl_is_utf8_alnumc #define Perl_is_utf8_idfirst CPerlObj::Perl_is_utf8_idfirst #define is_utf8_idfirst Perl_is_utf8_idfirst #define Perl_is_utf8_alpha 
CPerlObj::Perl_is_utf8_alpha #define is_utf8_alpha Perl_is_utf8_alpha +#define Perl_is_utf8_ascii CPerlObj::Perl_is_utf8_ascii +#define is_utf8_ascii Perl_is_utf8_ascii #define Perl_is_utf8_space CPerlObj::Perl_is_utf8_space #define is_utf8_space Perl_is_utf8_space +#define Perl_is_utf8_cntrl CPerlObj::Perl_is_utf8_cntrl +#define is_utf8_cntrl Perl_is_utf8_cntrl #define Perl_is_utf8_digit CPerlObj::Perl_is_utf8_digit #define is_utf8_digit Perl_is_utf8_digit +#define Perl_is_utf8_graph CPerlObj::Perl_is_utf8_graph +#define is_utf8_graph Perl_is_utf8_graph #define Perl_is_utf8_upper CPerlObj::Perl_is_utf8_upper #define is_utf8_upper Perl_is_utf8_upper #define Perl_is_utf8_lower CPerlObj::Perl_is_utf8_lower #define is_utf8_lower Perl_is_utf8_lower #define Perl_is_utf8_print CPerlObj::Perl_is_utf8_print #define is_utf8_print Perl_is_utf8_print +#define Perl_is_utf8_punct CPerlObj::Perl_is_utf8_punct +#define is_utf8_punct Perl_is_utf8_punct +#define Perl_is_utf8_xdigit CPerlObj::Perl_is_utf8_xdigit +#define is_utf8_xdigit Perl_is_utf8_xdigit #define Perl_is_utf8_mark CPerlObj::Perl_is_utf8_mark #define is_utf8_mark Perl_is_utf8_mark #define Perl_jmaybe CPerlObj::Perl_jmaybe @@ -4235,6 +4311,8 @@ #define re_croak2 S_re_croak2 #define S_regpposixcc CPerlObj::S_regpposixcc #define regpposixcc S_regpposixcc +#define S_checkposixcc CPerlObj::S_checkposixcc +#define checkposixcc S_checkposixcc #define S_clear_re CPerlObj::S_clear_re #define clear_re S_clear_re #endif @@ -4489,6 +4567,8 @@ #define ck_rfun Perl_ck_rfun #define Perl_ck_rvconst CPerlObj::Perl_ck_rvconst #define ck_rvconst Perl_ck_rvconst +#define Perl_ck_sassign CPerlObj::Perl_ck_sassign +#define ck_sassign Perl_ck_sassign #define Perl_ck_scmp CPerlObj::Perl_ck_scmp #define ck_scmp Perl_ck_scmp #define Perl_ck_select CPerlObj::Perl_ck_select diff --git a/embed.pl b/embed.pl index ed7f3e4..206dbbf 100755 --- a/embed.pl +++ b/embed.pl @@ -947,35 +947,53 @@ p |char* |instr |const char* big|const char* little p 
|bool |io_close |IO* io p |OP* |invert |OP* cmd p |bool |is_uni_alnum |U32 c +p |bool |is_uni_alnumc |U32 c p |bool |is_uni_idfirst |U32 c p |bool |is_uni_alpha |U32 c +p |bool |is_uni_ascii |U32 c p |bool |is_uni_space |U32 c +p |bool |is_uni_cntrl |U32 c +p |bool |is_uni_graph |U32 c p |bool |is_uni_digit |U32 c p |bool |is_uni_upper |U32 c p |bool |is_uni_lower |U32 c p |bool |is_uni_print |U32 c +p |bool |is_uni_punct |U32 c +p |bool |is_uni_xdigit |U32 c p |U32 |to_uni_upper |U32 c p |U32 |to_uni_title |U32 c p |U32 |to_uni_lower |U32 c p |bool |is_uni_alnum_lc|U32 c +p |bool |is_uni_alnumc_lc|U32 c p |bool |is_uni_idfirst_lc|U32 c p |bool |is_uni_alpha_lc|U32 c +p |bool |is_uni_ascii_lc|U32 c p |bool |is_uni_space_lc|U32 c +p |bool |is_uni_cntrl_lc|U32 c +p |bool |is_uni_graph_lc|U32 c p |bool |is_uni_digit_lc|U32 c p |bool |is_uni_upper_lc|U32 c p |bool |is_uni_lower_lc|U32 c p |bool |is_uni_print_lc|U32 c +p |bool |is_uni_punct_lc|U32 c +p |bool |is_uni_xdigit_lc|U32 c p |U32 |to_uni_upper_lc|U32 c p |U32 |to_uni_title_lc|U32 c p |U32 |to_uni_lower_lc|U32 c p |bool |is_utf8_alnum |U8 *p +p |bool |is_utf8_alnumc |U8 *p p |bool |is_utf8_idfirst|U8 *p p |bool |is_utf8_alpha |U8 *p +p |bool |is_utf8_ascii |U8 *p p |bool |is_utf8_space |U8 *p +p |bool |is_utf8_cntrl |U8 *p p |bool |is_utf8_digit |U8 *p +p |bool |is_utf8_graph |U8 *p p |bool |is_utf8_upper |U8 *p p |bool |is_utf8_lower |U8 *p p |bool |is_utf8_print |U8 *p +p |bool |is_utf8_punct |U8 *p +p |bool |is_utf8_xdigit |U8 *p p |bool |is_utf8_mark |U8 *p p |OP* |jmaybe |OP* arg p |I32 |keyword |char* d|I32 len @@ -1646,7 +1664,8 @@ s |I32 |study_chunk |regnode **scanp|I32 *deltap \ |regnode *last|scan_data_t *data|U32 flags s |I32 |add_data |I32 n|char *s rs |void|re_croak2 |const char* pat1|const char* pat2|... 
-s |char*|regpposixcc |I32 value +s |I32 |regpposixcc |I32 value +s |void |checkposixcc s |void |clear_re |void *r #endif diff --git a/embedvar.h b/embedvar.h index f759b63..74e7ca5 100644 --- a/embedvar.h +++ b/embedvar.h @@ -508,16 +508,22 @@ #define PL_uid (PL_curinterp->Iuid) #define PL_unsafe (PL_curinterp->Iunsafe) #define PL_utf8_alnum (PL_curinterp->Iutf8_alnum) +#define PL_utf8_alnumc (PL_curinterp->Iutf8_alnumc) #define PL_utf8_alpha (PL_curinterp->Iutf8_alpha) +#define PL_utf8_ascii (PL_curinterp->Iutf8_ascii) +#define PL_utf8_cntrl (PL_curinterp->Iutf8_cntrl) #define PL_utf8_digit (PL_curinterp->Iutf8_digit) +#define PL_utf8_graph (PL_curinterp->Iutf8_graph) #define PL_utf8_lower (PL_curinterp->Iutf8_lower) #define PL_utf8_mark (PL_curinterp->Iutf8_mark) #define PL_utf8_print (PL_curinterp->Iutf8_print) +#define PL_utf8_punct (PL_curinterp->Iutf8_punct) #define PL_utf8_space (PL_curinterp->Iutf8_space) #define PL_utf8_tolower (PL_curinterp->Iutf8_tolower) #define PL_utf8_totitle (PL_curinterp->Iutf8_totitle) #define PL_utf8_toupper (PL_curinterp->Iutf8_toupper) #define PL_utf8_upper (PL_curinterp->Iutf8_upper) +#define PL_utf8_xdigit (PL_curinterp->Iutf8_xdigit) #define PL_uudmap (PL_curinterp->Iuudmap) #define PL_warnhook (PL_curinterp->Iwarnhook) #define PL_xiv_arenaroot (PL_curinterp->Ixiv_arenaroot) @@ -765,16 +771,22 @@ #define PL_Iuid PL_uid #define PL_Iunsafe PL_unsafe #define PL_Iutf8_alnum PL_utf8_alnum +#define PL_Iutf8_alnumc PL_utf8_alnumc #define PL_Iutf8_alpha PL_utf8_alpha +#define PL_Iutf8_ascii PL_utf8_ascii +#define PL_Iutf8_cntrl PL_utf8_cntrl #define PL_Iutf8_digit PL_utf8_digit +#define PL_Iutf8_graph PL_utf8_graph #define PL_Iutf8_lower PL_utf8_lower #define PL_Iutf8_mark PL_utf8_mark #define PL_Iutf8_print PL_utf8_print +#define PL_Iutf8_punct PL_utf8_punct #define PL_Iutf8_space PL_utf8_space #define PL_Iutf8_tolower PL_utf8_tolower #define PL_Iutf8_totitle PL_utf8_totitle #define PL_Iutf8_toupper PL_utf8_toupper #define 
PL_Iutf8_upper PL_utf8_upper +#define PL_Iutf8_xdigit PL_utf8_xdigit #define PL_Iuudmap PL_uudmap #define PL_Iwarnhook PL_warnhook #define PL_Ixiv_arenaroot PL_xiv_arenaroot diff --git a/global.sym b/global.sym index 87ece3c..c5597d1 100644 --- a/global.sym +++ b/global.sym @@ -177,35 +177,53 @@ Perl_instr Perl_io_close Perl_invert Perl_is_uni_alnum +Perl_is_uni_alnumc Perl_is_uni_idfirst Perl_is_uni_alpha +Perl_is_uni_ascii Perl_is_uni_space +Perl_is_uni_cntrl +Perl_is_uni_graph Perl_is_uni_digit Perl_is_uni_upper Perl_is_uni_lower Perl_is_uni_print +Perl_is_uni_punct +Perl_is_uni_xdigit Perl_to_uni_upper Perl_to_uni_title Perl_to_uni_lower Perl_is_uni_alnum_lc +Perl_is_uni_alnumc_lc Perl_is_uni_idfirst_lc Perl_is_uni_alpha_lc +Perl_is_uni_ascii_lc Perl_is_uni_space_lc +Perl_is_uni_cntrl_lc +Perl_is_uni_graph_lc Perl_is_uni_digit_lc Perl_is_uni_upper_lc Perl_is_uni_lower_lc Perl_is_uni_print_lc +Perl_is_uni_punct_lc +Perl_is_uni_xdigit_lc Perl_to_uni_upper_lc Perl_to_uni_title_lc Perl_to_uni_lower_lc Perl_is_utf8_alnum +Perl_is_utf8_alnumc Perl_is_utf8_idfirst Perl_is_utf8_alpha +Perl_is_utf8_ascii Perl_is_utf8_space +Perl_is_utf8_cntrl Perl_is_utf8_digit +Perl_is_utf8_graph Perl_is_utf8_upper Perl_is_utf8_lower Perl_is_utf8_print +Perl_is_utf8_punct +Perl_is_utf8_xdigit Perl_is_utf8_mark Perl_jmaybe Perl_keyword diff --git a/handy.h b/handy.h index 851f348..95bcec7 100644 --- a/handy.h +++ b/handy.h @@ -215,13 +215,25 @@ typedef unsigned short U16; /* In EBCDIC we do not do locales: therefore() isupper() is fine. 
*/ # define isUPPER(c) isupper(c) # define isLOWER(c) islower(c) +# define isALNUMC(c) isalnum(c) +# define isASCII(c) isascii(c) +# define isCNTRL(c) iscntrl(c) +# define isGRAPH(c) isgraph(c) # define isPRINT(c) isprint(c) +# define isPUNCT(c) ispunct(c) +# define isXDIGIT(c) isxdigit(c) # define toUPPER(c) toupper(c) # define toLOWER(c) tolower(c) #else # define isUPPER(c) ((c) >= 'A' && (c) <= 'Z') # define isLOWER(c) ((c) >= 'a' && (c) <= 'z') +# define isALNUMC(c) (isALPHA(c) || isDIGIT(c)) +# define isASCII(c) ((c) <= 127) +# define isCNTRL(c) ((c) < ' ') +# define isGRAPH(c) (isALNUM(c) || isPUNCT(c)) # define isPRINT(c) (((c) > 32 && (c) < 127) || isSPACE(c)) +# define isPUNCT(c) (((c) >= 33 && (c) <= 47) || ((c) >= 58 && (c) <= 64) || ((c) >= 91 && (c) <= 96) || ((c) >= 123 && (c) <= 126)) +# define isXDIGIT(c) (isdigit(c) || ((c) >= 'a' && (c) <= 'f') || ((c) >= 'A' && (c) <= 'F')) # define toUPPER(c) (isLOWER(c) ? (c) - ('a' - 'A') : (c)) # define toLOWER(c) (isUPPER(c) ? (c) + ('a' - 'A') : (c)) #endif @@ -229,8 +241,7 @@ typedef unsigned short U16; #ifdef USE_NEXT_CTYPE # define isALNUM_LC(c) \ - (NXIsAlpha((unsigned int)(c)) || NXIsDigit((unsigned int)(c)) || \ - (char)(c) == '_') + (NXIsAlnum((unsigned int)(c)) || (char)(c) == '_') # define isIDFIRST_LC(c) \ (NXIsAlpha((unsigned int)(c)) || (char)(c) == '_') # define isALPHA_LC(c) NXIsAlpha((unsigned int)(c)) @@ -238,37 +249,47 @@ typedef unsigned short U16; # define isDIGIT_LC(c) NXIsDigit((unsigned int)(c)) # define isUPPER_LC(c) NXIsUpper((unsigned int)(c)) # define isLOWER_LC(c) NXIsLower((unsigned int)(c)) +# define isALNUMC_LC(c) NXIsAlnum((unsigned int)(c)) +# define isCNTRL_LC(c) NXIsCntrl((unsigned int)(c)) +# define isGRAPH_LC(c) NXIsGraph((unsigned int)(c)) # define isPRINT_LC(c) NXIsPrint((unsigned int)(c)) +# define isPUNCT_LC(c) NXIsPunct((unsigned int)(c)) # define toUPPER_LC(c) NXToUpper((unsigned int)(c)) # define toLOWER_LC(c) NXToLower((unsigned int)(c)) #else /* !USE_NEXT_CTYPE 
*/ + # if defined(CTYPE256) || (!defined(isascii) && !defined(HAS_ISASCII)) -# define isALNUM_LC(c) \ - (isalpha((unsigned char)(c)) || \ - isdigit((unsigned char)(c)) || (char)(c) == '_') +# define isALNUM_LC(c) (isalnum((unsigned char)(c)) || (char)(c) == '_') # define isIDFIRST_LC(c) (isalpha((unsigned char)(c)) || (char)(c) == '_') # define isALPHA_LC(c) isalpha((unsigned char)(c)) # define isSPACE_LC(c) isspace((unsigned char)(c)) # define isDIGIT_LC(c) isdigit((unsigned char)(c)) # define isUPPER_LC(c) isupper((unsigned char)(c)) # define isLOWER_LC(c) islower((unsigned char)(c)) +# define isALNUMC_LC(c) isalnum((unsigned char)(c)) +# define isCNTRL_LC(c) iscntrl((unsigned char)(c)) +# define isGRAPH_LC(c) isgraph((unsigned char)(c)) # define isPRINT_LC(c) isprint((unsigned char)(c)) +# define isPUNCT_LC(c) ispunct((unsigned char)(c)) # define toUPPER_LC(c) toupper((unsigned char)(c)) # define toLOWER_LC(c) tolower((unsigned char)(c)) # else -# define isALNUM_LC(c) \ - (isascii(c) && (isalpha(c) || isdigit(c) || (c) == '_')) +# define isALNUM_LC(c) (isascii(c) && (isalnum(c) || (c) == '_')) # define isIDFIRST_LC(c) (isascii(c) && (isalpha(c) || (c) == '_')) # define isALPHA_LC(c) (isascii(c) && isalpha(c)) # define isSPACE_LC(c) (isascii(c) && isspace(c)) # define isDIGIT_LC(c) (isascii(c) && isdigit(c)) # define isUPPER_LC(c) (isascii(c) && isupper(c)) # define isLOWER_LC(c) (isascii(c) && islower(c)) +# define isALNUMC_LC(c) (isascii(c) && isalnum(c)) +# define isCNTRL_LC(c) (isascii(c) && iscntrl(c)) +# define isGRAPH_LC(c) (isascii(c) && isgraph(c)) # define isPRINT_LC(c) (isascii(c) && isprint(c)) +# define isPUNCT_LC(c) (isascii(c) && ispunct(c)) # define toUPPER_LC(c) toupper(c) # define toLOWER_LC(c) tolower(c) @@ -282,7 +303,13 @@ typedef unsigned short U16; #define isDIGIT_uni(c) is_uni_digit(c) #define isUPPER_uni(c) is_uni_upper(c) #define isLOWER_uni(c) is_uni_lower(c) +#define isALNUMC_uni(c) is_uni_alnumc(c) +#define isASCII_uni(c) 
is_uni_ascii(c) +#define isCNTRL_uni(c) is_uni_cntrl(c) +#define isGRAPH_uni(c) is_uni_graph(c) #define isPRINT_uni(c) is_uni_print(c) +#define isPUNCT_uni(c) is_uni_punct(c) +#define isXDIGIT_uni(c) is_uni_xdigit(c) #define toUPPER_uni(c) to_uni_upper(c) #define toTITLE_uni(c) to_uni_title(c) #define toLOWER_uni(c) to_uni_lower(c) @@ -294,7 +321,11 @@ typedef unsigned short U16; #define isDIGIT_LC_uni(c) (c < 256 ? isDIGIT_LC(c) : is_uni_digit_lc(c)) #define isUPPER_LC_uni(c) (c < 256 ? isUPPER_LC(c) : is_uni_upper_lc(c)) #define isLOWER_LC_uni(c) (c < 256 ? isLOWER_LC(c) : is_uni_lower_lc(c)) +#define isALNUMC_LC_uni(c) (c < 256 ? isALNUMC_LC(c) : is_uni_alnumc_lc(c)) +#define isCNTRL_LC_uni(c) (c < 256 ? isCNTRL_LC(c) : is_uni_cntrl_lc(c)) +#define isGRAPH_LC_uni(c) (c < 256 ? isGRAPH_LC(c) : is_uni_graph_lc(c)) #define isPRINT_LC_uni(c) (c < 256 ? isPRINT_LC(c) : is_uni_print_lc(c)) +#define isPUNCT_LC_uni(c) (c < 256 ? isPUNCT_LC(c) : is_uni_punct_lc(c)) #define toUPPER_LC_uni(c) (c < 256 ? toUPPER_LC(c) : to_uni_upper_lc(c)) #define toTITLE_LC_uni(c) (c < 256 ? toUPPER_LC(c) : to_uni_title_lc(c)) #define toLOWER_LC_uni(c) (c < 256 ? 
toLOWER_LC(c) : to_uni_lower_lc(c)) @@ -306,7 +337,13 @@ typedef unsigned short U16; #define isDIGIT_utf8(p) is_utf8_digit(p) #define isUPPER_utf8(p) is_utf8_upper(p) #define isLOWER_utf8(p) is_utf8_lower(p) +#define isALNUMC_utf8(p) is_utf8_alnumc(p) +#define isASCII_utf8(p) is_utf8_ascii(p) +#define isCNTRL_utf8(p) is_utf8_cntrl(p) +#define isGRAPH_utf8(p) is_utf8_graph(p) #define isPRINT_utf8(p) is_utf8_print(p) +#define isPUNCT_utf8(p) is_utf8_punct(p) +#define isXDIGIT_utf8(p) is_utf8_xdigit(p) #define toUPPER_utf8(p) to_utf8_upper(p) #define toTITLE_utf8(p) to_utf8_title(p) #define toLOWER_utf8(p) to_utf8_lower(p) @@ -318,7 +355,11 @@ typedef unsigned short U16; #define isDIGIT_LC_utf8(p) isDIGIT_LC_uni(utf8_to_uv(p, 0)) #define isUPPER_LC_utf8(p) isUPPER_LC_uni(utf8_to_uv(p, 0)) #define isLOWER_LC_utf8(p) isLOWER_LC_uni(utf8_to_uv(p, 0)) +#define isALNUMC_LC_utf8(p) isALNUMC_LC_uni(utf8_to_uv(p, 0)) +#define isCNTRL_LC_utf8(p) isCNTRL_LC_uni(utf8_to_uv(p, 0)) +#define isGRAPH_LC_utf8(p) isGRAPH_LC_uni(utf8_to_uv(p, 0)) #define isPRINT_LC_utf8(p) isPRINT_LC_uni(utf8_to_uv(p, 0)) +#define isPUNCT_LC_utf8(p) isPUNCT_LC_uni(utf8_to_uv(p, 0)) #define toUPPER_LC_utf8(p) toUPPER_LC_uni(utf8_to_uv(p, 0)) #define toTITLE_LC_utf8(p) toTITLE_LC_uni(utf8_to_uv(p, 0)) #define toLOWER_LC_utf8(p) toLOWER_LC_uni(utf8_to_uv(p, 0)) diff --git a/intrpvar.h b/intrpvar.h index 5cff858..e2d1d5f 100644 --- a/intrpvar.h +++ b/intrpvar.h @@ -322,12 +322,18 @@ PERLVAR(Inumeric_radix, char) /* utf8 character classes */ PERLVAR(Iutf8_alnum, SV *) +PERLVAR(Iutf8_alnumc, SV *) +PERLVAR(Iutf8_ascii, SV *) PERLVAR(Iutf8_alpha, SV *) PERLVAR(Iutf8_space, SV *) +PERLVAR(Iutf8_cntrl, SV *) +PERLVAR(Iutf8_graph, SV *) PERLVAR(Iutf8_digit, SV *) PERLVAR(Iutf8_upper, SV *) PERLVAR(Iutf8_lower, SV *) PERLVAR(Iutf8_print, SV *) +PERLVAR(Iutf8_punct, SV *) +PERLVAR(Iutf8_xdigit, SV *) PERLVAR(Iutf8_mark, SV *) PERLVAR(Iutf8_toupper, SV *) PERLVAR(Iutf8_totitle, SV *) diff --git 
a/lib/unicode/Bidirectional.pl b/lib/unicode/Bidirectional.pl index 1523d50..e10210d 100644 --- a/lib/unicode/Bidirectional.pl +++ b/lib/unicode/Bidirectional.pl @@ -233,6 +233,32 @@ return <<'END'; 1100 1159 L 115f 11a2 L 11a8 11f9 L +1200 1206 L +1208 1246 L +1248 L +124a 124d L +1250 1256 L +1258 L +125a 125d L +1260 1286 L +1288 L +128a 128d L +1290 12ae L +12b0 L +12b2 12b5 L +12b8 12be L +12c0 L +12c2 12c5 L +12c8 12ce L +12d0 12d6 L +12d8 12ee L +12f0 130e L +1310 L +1312 1315 L +1318 131e L +1320 1346 L +1348 135a L +1361 137c L 1e00 1e9b L 1ea0 1ef9 L 1f00 1f15 L diff --git a/lib/unicode/Block.pl b/lib/unicode/Block.pl index ce9289a..1c0b280 100644 --- a/lib/unicode/Block.pl +++ b/lib/unicode/Block.pl @@ -27,7 +27,6 @@ return <<'END'; 1100 11FF Hangul Jamo 1E00 1EFF Latin Extended Additional 1F00 1FFF Greek Extended -1200 137F Ethiopic 2000 206F General Punctuation 2070 209F Superscripts and Subscripts 20A0 20CF Currency Symbols diff --git a/lib/unicode/Category.pl b/lib/unicode/Category.pl index 3b47570..5c0842c 100644 --- a/lib/unicode/Category.pl +++ b/lib/unicode/Category.pl @@ -762,7 +762,31 @@ return <<'END'; 1100 1159 Lo 115f 11a2 Lo 11a8 11f9 Lo -1200 135a Lo +1200 1206 Lo +1208 1246 Lo +1248 Lo +124a 124d Lo +1250 1256 Lo +1258 Lo +125a 125d Lo +1260 1286 Lo +1288 Lo +128a 128d Lo +1290 12ae Lo +12b0 Lo +12b2 12b5 Lo +12b8 12be Lo +12c0 Lo +12c2 12c5 Lo +12c8 12ce Lo +12d0 12d6 Lo +12d8 12ee Lo +12f0 130e Lo +1310 Lo +1312 1315 Lo +1318 131e Lo +1320 1346 Lo +1348 135a Lo 1361 1368 Po 1369 1371 Nd 1372 137c No diff --git a/lib/unicode/Is/ASCII.pl b/lib/unicode/Is/ASCII.pl new file mode 100644 index 0000000..b7843e9 --- /dev/null +++ b/lib/unicode/Is/ASCII.pl @@ -0,0 +1,3 @@ +return <<'END'; +0000 007f +END diff --git a/lib/unicode/Is/Alnum.pl b/lib/unicode/Is/Alnum.pl index aa82e4f..ac48257 100644 --- a/lib/unicode/Is/Alnum.pl +++ b/lib/unicode/Is/Alnum.pl @@ -1,7 +1,6 @@ return <<'END'; 0030 0039 0041 005a -005f 0061 007a 00aa 00b5 @@ -156,8 
+155,32 @@ return <<'END'; 1100 1159 115f 11a2 11a8 11f9 -1200 135a -1369 137c +1200 1206 +1208 1246 +1248 +124a 124d +1250 1256 +1258 +125a 125d +1260 1286 +1288 +128a 128d +1290 12ae +12b0 +12b2 12b5 +12b8 12be +12c0 +12c2 12c5 +12c8 12ce +12d0 12d6 +12d8 12ee +12f0 130e +1310 +1312 1315 +1318 131e +1320 1346 +1348 135a +1369 1371 1e00 1e9b 1ea0 1ef9 1f00 1f15 diff --git a/lib/unicode/Is/Alpha.pl b/lib/unicode/Is/Alpha.pl index ea6fa7f..9de0521 100644 --- a/lib/unicode/Is/Alpha.pl +++ b/lib/unicode/Is/Alpha.pl @@ -141,7 +141,31 @@ return <<'END'; 1100 1159 115f 11a2 11a8 11f9 -1200 135a +1200 1206 +1208 1246 +1248 +124a 124d +1250 1256 +1258 +125a 125d +1260 1286 +1288 +128a 128d +1290 12ae +12b0 +12b2 12b5 +12b8 12be +12c0 +12c2 12c5 +12c8 12ce +12d0 12d6 +12d8 12ee +12f0 130e +1310 +1312 1315 +1318 131e +1320 1346 +1348 135a 1e00 1e9b 1ea0 1ef9 1f00 1f15 diff --git a/lib/unicode/Is/BidiL.pl b/lib/unicode/Is/BidiL.pl index 85de325..c17ef10 100644 --- a/lib/unicode/Is/BidiL.pl +++ b/lib/unicode/Is/BidiL.pl @@ -186,6 +186,32 @@ return <<'END'; 1100 1159 115f 11a2 11a8 11f9 +1200 1206 +1208 1246 +1248 +124a 124d +1250 1256 +1258 +125a 125d +1260 1286 +1288 +128a 128d +1290 12ae +12b0 +12b2 12b5 +12b8 12be +12c0 +12c2 12c5 +12c8 12ce +12d0 12d6 +12d8 12ee +12f0 130e +1310 +1312 1315 +1318 131e +1320 1346 +1348 135a +1361 137c 1e00 1e9b 1ea0 1ef9 1f00 1f15 diff --git a/lib/unicode/Is/Cntrl.pl b/lib/unicode/Is/Cntrl.pl new file mode 100644 index 0000000..274239f --- /dev/null +++ b/lib/unicode/Is/Cntrl.pl @@ -0,0 +1,12 @@ +return <<'END'; +0000 001f +007f 009f +200c 200f +202a 202e +206a 206f +d800 db7f +db80 dbff +dc00 dfff +e000 f8ff +feff +END diff --git a/lib/unicode/Is/Digit.pl b/lib/unicode/Is/Digit.pl index 2181f15..a25e28f 100644 --- a/lib/unicode/Is/Digit.pl +++ b/lib/unicode/Is/Digit.pl @@ -14,6 +14,6 @@ return <<'END'; 0e50 0e59 0ed0 0ed9 0f20 0f29 -1369 137c +1369 1371 ff10 ff19 END diff --git a/lib/unicode/Is/Graph.pl b/lib/unicode/Is/Graph.pl new file 
mode 100644 index 0000000..7a8c225 --- /dev/null +++ b/lib/unicode/Is/Graph.pl @@ -0,0 +1,327 @@ +return <<'END'; +0021 007e +00a0 01f5 +01fa 0217 +0250 02a8 +02b0 02de +02e0 02e9 +0300 0345 +0360 0361 +0374 0375 +037a +037e +0384 038a +038c +038e 03a1 +03a3 03ce +03d0 03d6 +03da +03dc +03de +03e0 +03e2 03f3 +0401 040c +040e 044f +0451 045c +045e 0486 +0490 04c4 +04c7 04c8 +04cb 04cc +04d0 04eb +04ee 04f5 +04f8 04f9 +0531 0556 +0559 055f +0561 0587 +0589 +0591 05a1 +05a3 05b9 +05bb 05c4 +05d0 05ea +05f0 05f4 +060c +061b +061f +0621 063a +0640 0652 +0660 066d +0670 06b7 +06ba 06be +06c0 06ce +06d0 06ed +06f0 06f9 +0901 0903 +0905 0939 +093c 094d +0950 0954 +0958 0970 +0981 0983 +0985 098c +098f 0990 +0993 09a8 +09aa 09b0 +09b2 +09b6 09b9 +09bc +09be 09c4 +09c7 09c8 +09cb 09cd +09d7 +09dc 09dd +09df 09e3 +09e6 09fa +0a02 +0a05 0a0a +0a0f 0a10 +0a13 0a28 +0a2a 0a30 +0a32 0a33 +0a35 0a36 +0a38 0a39 +0a3c +0a3e 0a42 +0a47 0a48 +0a4b 0a4d +0a59 0a5c +0a5e +0a66 0a74 +0a81 0a83 +0a85 0a8b +0a8d +0a8f 0a91 +0a93 0aa8 +0aaa 0ab0 +0ab2 0ab3 +0ab5 0ab9 +0abc 0ac5 +0ac7 0ac9 +0acb 0acd +0ad0 +0ae0 +0ae6 0aef +0b01 0b03 +0b05 0b0c +0b0f 0b10 +0b13 0b28 +0b2a 0b30 +0b32 0b33 +0b36 0b39 +0b3c 0b43 +0b47 0b48 +0b4b 0b4d +0b56 0b57 +0b5c 0b5d +0b5f 0b61 +0b66 0b70 +0b82 0b83 +0b85 0b8a +0b8e 0b90 +0b92 0b95 +0b99 0b9a +0b9c +0b9e 0b9f +0ba3 0ba4 +0ba8 0baa +0bae 0bb5 +0bb7 0bb9 +0bbe 0bc2 +0bc6 0bc8 +0bca 0bcd +0bd7 +0be7 0bf2 +0c01 0c03 +0c05 0c0c +0c0e 0c10 +0c12 0c28 +0c2a 0c33 +0c35 0c39 +0c3e 0c44 +0c46 0c48 +0c4a 0c4d +0c55 0c56 +0c60 0c61 +0c66 0c6f +0c82 0c83 +0c85 0c8c +0c8e 0c90 +0c92 0ca8 +0caa 0cb3 +0cb5 0cb9 +0cbe 0cc4 +0cc6 0cc8 +0cca 0ccd +0cd5 0cd6 +0cde +0ce0 0ce1 +0ce6 0cef +0d02 0d03 +0d05 0d0c +0d0e 0d10 +0d12 0d28 +0d2a 0d39 +0d3e 0d43 +0d46 0d48 +0d4a 0d4d +0d57 +0d60 0d61 +0d66 0d6f +0e01 0e3a +0e3f 0e5b +0e81 0e82 +0e84 +0e87 0e88 +0e8a +0e8d +0e94 0e97 +0e99 0e9f +0ea1 0ea3 +0ea5 +0ea7 +0eaa 0eab +0ead 0eb9 +0ebb 0ebd +0ec0 0ec4 +0ec6 +0ec8 0ecd +0ed0 0ed9 
+0edc 0edd +0f00 0f47 +0f49 0f69 +0f71 0f8b +0f90 0f95 +0f97 +0f99 0fad +0fb1 0fb7 +0fb9 +10a0 10c5 +10d0 10f6 +10fb +1100 1159 +115f 11a2 +11a8 11f9 +1200 1206 +1208 1246 +1248 +124a 124d +1250 1256 +1258 +125a 125d +1260 1286 +1288 +128a 128d +1290 12ae +12b0 +12b2 12b5 +12b8 12be +12c0 +12c2 12c5 +12c8 12ce +12d0 12d6 +12d8 12ee +12f0 130e +1310 +1312 1315 +1318 131e +1320 1346 +1348 135a +1361 137c +1e00 1e9b +1ea0 1ef9 +1f00 1f15 +1f18 1f1d +1f20 1f45 +1f48 1f4d +1f50 1f57 +1f59 +1f5b +1f5d +1f5f 1f7d +1f80 1fb4 +1fb6 1fc4 +1fc6 1fd3 +1fd6 1fdb +1fdd 1fef +1ff2 1ff4 +1ff6 1ffe +2000 200b +2010 2029 +2030 2046 +2070 +2074 208e +20a0 20ac +20d0 20e1 +2100 2138 +2153 2182 +2190 21ea +2200 22f1 +2300 +2302 237a +2400 2424 +2440 244a +2460 24ea +2500 2595 +25a0 25ef +2600 2613 +261a 266f +2701 2704 +2706 2709 +270c 2727 +2729 274b +274d +274f 2752 +2756 +2758 275e +2761 2767 +2776 2794 +2798 27af +27b1 27be +3000 3037 +303f +3041 3094 +3099 309e +30a1 30fe +3105 312c +3131 318e +3190 319f +3200 321c +3220 3243 +3260 327b +327f 32b0 +32c0 32cb +32d0 32fe +3300 3376 +337b 33dd +33e0 33fe +4e00 9fa5 +ac00 d7a3 +f900 fa2d +fb00 fb06 +fb13 fb17 +fb1e fb36 +fb38 fb3c +fb3e +fb40 fb41 +fb43 fb44 +fb46 fbb1 +fbd3 fd3f +fd50 fd8f +fd92 fdc7 +fdf0 fdfb +fe20 fe23 +fe30 fe44 +fe49 fe52 +fe54 fe66 +fe68 fe6b +fe70 fe72 +fe74 +fe76 fefc +ff01 ff5e +ff61 ffbe +ffc2 ffc7 +ffca ffcf +ffd2 ffd7 +ffda ffdc +ffe0 ffe6 +ffe8 ffee +fffc fffd +END diff --git a/lib/unicode/Is/L.pl b/lib/unicode/Is/L.pl index 9c8e3cf..06796fd 100644 --- a/lib/unicode/Is/L.pl +++ b/lib/unicode/Is/L.pl @@ -145,7 +145,31 @@ return <<'END'; 1100 1159 115f 11a2 11a8 11f9 -1200 135a +1200 1206 +1208 1246 +1248 +124a 124d +1250 1256 +1258 +125a 125d +1260 1286 +1288 +128a 128d +1290 12ae +12b0 +12b2 12b5 +12b8 12be +12c0 +12c2 12c5 +12c8 12ce +12d0 12d6 +12d8 12ee +12f0 130e +1310 +1312 1315 +1318 131e +1320 1346 +1348 135a 1e00 1e9b 1ea0 1ef9 1f00 1f15 diff --git a/lib/unicode/Is/Lo.pl b/lib/unicode/Is/Lo.pl 
index 30f776d..07da29e 100644 --- a/lib/unicode/Is/Lo.pl +++ b/lib/unicode/Is/Lo.pl @@ -107,7 +107,31 @@ return <<'END'; 1100 1159 115f 11a2 11a8 11f9 -1200 135a +1200 1206 +1208 1246 +1248 +124a 124d +1250 1256 +1258 +125a 125d +1260 1286 +1288 +128a 128d +1290 12ae +12b0 +12b2 12b5 +12b8 12be +12c0 +12c2 12c5 +12c8 12ce +12d0 12d6 +12d8 12ee +12f0 130e +1310 +1312 1315 +1318 131e +1320 1346 +1348 135a 2135 2138 3041 3094 30a1 30fa diff --git a/lib/unicode/Is/Print.pl b/lib/unicode/Is/Print.pl index eef2d31..c2e6070 100644 --- a/lib/unicode/Is/Print.pl +++ b/lib/unicode/Is/Print.pl @@ -199,6 +199,32 @@ return <<'END'; 1100 1159 115f 11a2 11a8 11f9 +1200 1206 +1208 1246 +1248 +124a 124d +1250 1256 +1258 +125a 125d +1260 1286 +1288 +128a 128d +1290 12ae +12b0 +12b2 12b5 +12b8 12be +12c0 +12c2 12c5 +12c8 12ce +12d0 12d6 +12d8 12ee +12f0 130e +1310 +1312 1315 +1318 131e +1320 1346 +1348 135a +1361 137c 1e00 1e9b 1ea0 1ef9 1f00 1f15 diff --git a/lib/unicode/Is/Punct.pl b/lib/unicode/Is/Punct.pl new file mode 100644 index 0000000..0d52205 --- /dev/null +++ b/lib/unicode/Is/Punct.pl @@ -0,0 +1,70 @@ +return <<'END'; +0021 0023 +0025 002a +002c 002f +003a 003b +003f 0040 +005b 005d +005f +007b +007d +00a1 +00ab +00ad +00b7 +00bb +00bf +0374 0375 +037e +0387 +055a 055f +0589 +05be +05c0 +05c3 +05f3 05f4 +060c +061b +061f +066a 066d +06d4 +0964 0965 +0970 +0e2f +0e5a 0e5b +0eaf +0f04 0f12 +0f3a 0f3d +0f85 +10fb +1361 1368 +2010 2027 +2030 2043 +2045 2046 +207d 207e +208d 208e +2329 232a +3001 3003 +3006 +3008 3011 +3014 301f +3030 +30fb +fd3e fd3f +fe30 fe44 +fe49 fe52 +fe54 fe61 +fe63 +fe68 +fe6a fe6b +ff01 ff03 +ff05 ff0a +ff0c ff0f +ff1a ff1b +ff1f ff20 +ff3b ff3d +ff3f +ff5b +ff5d +ff61 ff65 +END diff --git a/lib/unicode/Is/Space.pl b/lib/unicode/Is/Space.pl index 903f854..715afc3 100644 --- a/lib/unicode/Is/Space.pl +++ b/lib/unicode/Is/Space.pl @@ -3,7 +3,6 @@ return <<'END'; 000c 000d 0020 00a0 -1361 2000 200b 2028 2029 3000 diff --git a/lib/unicode/Is/Word.pl 
b/lib/unicode/Is/Word.pl new file mode 100644 index 0000000..6a30246 --- /dev/null +++ b/lib/unicode/Is/Word.pl @@ -0,0 +1,250 @@ +return <<'END'; +0030 0039 +0041 005a +005f +0061 007a +00aa +00b5 +00ba +00c0 00d6 +00d8 00f6 +00f8 01c4 +01c6 01c7 +01c9 01ca +01cc 01f1 +01f3 01f5 +01fa 0217 +0250 02a8 +0386 +0388 038a +038c +038e 03a1 +03a3 03ce +03d0 03d6 +03da +03dc +03de +03e0 +03e2 03f3 +0401 040c +040e 044f +0451 045c +045e 0481 +0490 04c4 +04c7 04c8 +04cb 04cc +04d0 04eb +04ee 04f5 +04f8 04f9 +0531 0556 +0561 0587 +05d0 05ea +05f0 05f2 +0621 063a +0641 064a +0660 0669 +0671 06b7 +06ba 06be +06c0 06ce +06d0 06d3 +06d5 +06f0 06f9 +0905 0939 +093d +0958 0961 +0966 096f +0985 098c +098f 0990 +0993 09a8 +09aa 09b0 +09b2 +09b6 09b9 +09dc 09dd +09df 09e1 +09e6 09f1 +0a05 0a0a +0a0f 0a10 +0a13 0a28 +0a2a 0a30 +0a32 0a33 +0a35 0a36 +0a38 0a39 +0a59 0a5c +0a5e +0a66 0a6f +0a72 0a74 +0a85 0a8b +0a8d +0a8f 0a91 +0a93 0aa8 +0aaa 0ab0 +0ab2 0ab3 +0ab5 0ab9 +0abd +0ae0 +0ae6 0aef +0b05 0b0c +0b0f 0b10 +0b13 0b28 +0b2a 0b30 +0b32 0b33 +0b36 0b39 +0b3d +0b5c 0b5d +0b5f 0b61 +0b66 0b6f +0b85 0b8a +0b8e 0b90 +0b92 0b95 +0b99 0b9a +0b9c +0b9e 0b9f +0ba3 0ba4 +0ba8 0baa +0bae 0bb5 +0bb7 0bb9 +0be7 0bef +0c05 0c0c +0c0e 0c10 +0c12 0c28 +0c2a 0c33 +0c35 0c39 +0c60 0c61 +0c66 0c6f +0c85 0c8c +0c8e 0c90 +0c92 0ca8 +0caa 0cb3 +0cb5 0cb9 +0cde +0ce0 0ce1 +0ce6 0cef +0d05 0d0c +0d0e 0d10 +0d12 0d28 +0d2a 0d39 +0d60 0d61 +0d66 0d6f +0e01 0e2e +0e30 +0e32 0e33 +0e40 0e45 +0e50 0e59 +0e81 0e82 +0e84 +0e87 0e88 +0e8a +0e8d +0e94 0e97 +0e99 0e9f +0ea1 0ea3 +0ea5 +0ea7 +0eaa 0eab +0ead 0eae +0eb0 +0eb2 0eb3 +0ebd +0ec0 0ec4 +0ed0 0ed9 +0edc 0edd +0f20 0f29 +0f40 0f47 +0f49 0f69 +10a0 10c5 +10d0 10f6 +1100 1159 +115f 11a2 +11a8 11f9 +1200 1206 +1208 1246 +1248 +124a 124d +1250 1256 +1258 +125a 125d +1260 1286 +1288 +128a 128d +1290 12ae +12b0 +12b2 12b5 +12b8 12be +12c0 +12c2 12c5 +12c8 12ce +12d0 12d6 +12d8 12ee +12f0 130e +1310 +1312 1315 +1318 131e +1320 1346 +1348 135a +1369 1371 +1e00 
1e9b +1ea0 1ef9 +1f00 1f15 +1f18 1f1d +1f20 1f45 +1f48 1f4d +1f50 1f57 +1f59 +1f5b +1f5d +1f5f 1f7d +1f80 1fb4 +1fb6 1fbc +1fbe +1fc2 1fc4 +1fc6 1fcc +1fd0 1fd3 +1fd6 1fdb +1fe0 1fec +1ff2 1ff4 +1ff6 1ffc +207f +2102 +2107 +210a 2113 +2115 +2118 211d +2124 +2126 +2128 +212a 2131 +2133 2138 +3041 3094 +30a1 30fa +3105 312c +3131 318e +4e00 9fa5 +ac00 d7a3 +f900 fa2d +fb00 fb06 +fb13 fb17 +fb1f fb28 +fb2a fb36 +fb38 fb3c +fb3e +fb40 fb41 +fb43 fb44 +fb46 fbb1 +fbd3 fd3d +fd50 fd8f +fd92 fdc7 +fdf0 fdfb +fe70 fe72 +fe74 +fe76 fefc +ff10 ff19 +ff21 ff3a +ff41 ff5a +ff66 ff6f +ff71 ff9d +ffa0 ffbe +ffc2 ffc7 +ffca ffcf +ffd2 ffd7 +ffda ffdc +END diff --git a/lib/unicode/Is/XDigit.pl b/lib/unicode/Is/XDigit.pl new file mode 100644 index 0000000..f0b7044 --- /dev/null +++ b/lib/unicode/Is/XDigit.pl @@ -0,0 +1,5 @@ +return <<'END'; +0030 0039 +0041 0046 +0061 0066 +END diff --git a/lib/unicode/Is/Z.pl b/lib/unicode/Is/Z.pl index af595da..9e83d94 100644 --- a/lib/unicode/Is/Z.pl +++ b/lib/unicode/Is/Z.pl @@ -1,7 +1,6 @@ return <<'END'; 0020 00a0 -1361 2000 200b 2028 2029 3000 diff --git a/lib/unicode/Is/Zs.pl b/lib/unicode/Is/Zs.pl index 403728c..87d4455 100644 --- a/lib/unicode/Is/Zs.pl +++ b/lib/unicode/Is/Zs.pl @@ -1,7 +1,6 @@ return <<'END'; 0020 00a0 -1361 2000 200b 3000 END diff --git a/lib/unicode/Name.pl b/lib/unicode/Name.pl index 0925bad..45099ac 100644 --- a/lib/unicode/Name.pl +++ b/lib/unicode/Name.pl @@ -2740,352 +2740,351 @@ return <<'END'; 11f7 HANGUL JONGSEONG HIEUH-MIEUM 11f8 HANGUL JONGSEONG HIEUH-PIEUP 11f9 HANGUL JONGSEONG YEORINHIEUH -1200 ETHIOPIC SYLLABLE HA -1201 ETHIOPIC SYLLABLE HU -1202 ETHIOPIC SYLLABLE HI -1203 ETHIOPIC SYLLABLE HAA -1204 ETHIOPIC SYLLABLE HEE -1205 ETHIOPIC SYLLABLE HE -1206 ETHIOPIC SYLLABLE HO -1208 ETHIOPIC SYLLABLE LA -1209 ETHIOPIC SYLLABLE LU -120A ETHIOPIC SYLLABLE LI -120B ETHIOPIC SYLLABLE LAA -120C ETHIOPIC SYLLABLE LEE -120D ETHIOPIC SYLLABLE LE -120E ETHIOPIC SYLLABLE LO -120F ETHIOPIC SYLLABLE LWA -1210 ETHIOPIC 
SYLLABLE HHA -1211 ETHIOPIC SYLLABLE HHU -1212 ETHIOPIC SYLLABLE HHI -1213 ETHIOPIC SYLLABLE HHAA -1214 ETHIOPIC SYLLABLE HHEE -1215 ETHIOPIC SYLLABLE HHE -1217 ETHIOPIC SYLLABLE HHWA -1218 ETHIOPIC SYLLABLE MA -1219 ETHIOPIC SYLLABLE MU -121A ETHIOPIC SYLLABLE MI -121B ETHIOPIC SYLLABLE MAA -121C ETHIOPIC SYLLABLE MEE -121D ETHIOPIC SYLLABLE ME -121E ETHIOPIC SYLLABLE MO -121F ETHIOPIC SYLLABLE MWAA -1220 ETHIOPIC SYLLABLE SZA -1221 ETHIOPIC SYLLABLE SZU -1222 ETHIOPIC SYLLABLE SZI -1223 ETHIOPIC SYLLABLE SZAA -1224 ETHIOPIC SYLLABLE SZEE -1225 ETHIOPIC SYLLABLE SZE -1226 ETHIOPIC SYLLABLE SZO -1227 ETHIOPIC SYLLABLE SZWA -1228 ETHIOPIC SYLLABLE RA -1229 ETHIOPIC SYLLABLE RU -122A ETHIOPIC SYLLABLE RI -122B ETHIOPIC SYLLABLE RAA -122C ETHIOPIC SYLLABLE REE -122D ETHIOPIC SYLLABLE RE -122E ETHIOPIC SYLLABLE RO -122F ETHIOPIC SYLLABLE RWA -1230 ETHIOPIC SYLLABLE SA -1231 ETHIOPIC SYLLABLE SU -1232 ETHIOPIC SYLLABLE SI -1233 ETHIOPIC SYLLABLE SAA -1234 ETHIOPIC SYLLABLE SEE -1235 ETHIOPIC SYLLABLE SE -1236 ETHIOPIC SYLLABLE SO -1237 ETHIOPIC SYLLABLE SWA -1238 ETHIOPIC SYLLABLE SHA -1239 ETHIOPIC SYLLABLE SHU -123A ETHIOPIC SYLLABLE SHI -123B ETHIOPIC SYLLABLE SHAA -123C ETHIOPIC SYLLABLE SHEE -123D ETHIOPIC SYLLABLE SHE -123E ETHIOPIC SYLLABLE SHO -123F ETHIOPIC SYLLABLE SHWA -1240 ETHIOPIC SYLLABLE QA -1241 ETHIOPIC SYLLABLE QU -1242 ETHIOPIC SYLLABLE QI -1243 ETHIOPIC SYLLABLE QAA -1244 ETHIOPIC SYLLABLE QEE -1245 ETHIOPIC SYLLABLE QE -1246 ETHIOPIC SYLLABLE QO -1248 ETHIOPIC SYLLABLE QWA -124A ETHIOPIC SYLLABLE QWI -124B ETHIOPIC SYLLABLE QWAA -124C ETHIOPIC SYLLABLE QWEE -124D ETHIOPIC SYLLABLE QWE -1250 ETHIOPIC SYLLABLE QHA -1251 ETHIOPIC SYLLABLE QHU -1252 ETHIOPIC SYLLABLE QHI -1253 ETHIOPIC SYLLABLE QHAA -1254 ETHIOPIC SYLLABLE QHEE -1255 ETHIOPIC SYLLABLE QHE -1256 ETHIOPIC SYLLABLE QHO -1258 ETHIOPIC SYLLABLE QHWA -125A ETHIOPIC SYLLABLE QHWI -125B ETHIOPIC SYLLABLE QHWAA -125C ETHIOPIC SYLLABLE QHWEE -125D ETHIOPIC SYLLABLE QHWE -1260 ETHIOPIC SYLLABLE 
BA -1261 ETHIOPIC SYLLABLE BU -1262 ETHIOPIC SYLLABLE BI -1263 ETHIOPIC SYLLABLE BAA -1264 ETHIOPIC SYLLABLE BEE -1265 ETHIOPIC SYLLABLE BE -1266 ETHIOPIC SYLLABLE BO -1267 ETHIOPIC SYLLABLE BWAA -1268 ETHIOPIC SYLLABLE VA -1269 ETHIOPIC SYLLABLE VU -126A ETHIOPIC SYLLABLE VI -126B ETHIOPIC SYLLABLE VAA -126C ETHIOPIC SYLLABLE VEE -126D ETHIOPIC SYLLABLE VE -126E ETHIOPIC SYLLABLE VO -126F ETHIOPIC SYLLABLE VWA -1270 ETHIOPIC SYLLABLE TA -1271 ETHIOPIC SYLLABLE TU -1272 ETHIOPIC SYLLABLE TI -1273 ETHIOPIC SYLLABLE TAA -1274 ETHIOPIC SYLLABLE TEE -1275 ETHIOPIC SYLLABLE TE -1276 ETHIOPIC SYLLABLE TO -1277 ETHIOPIC SYLLABLE TWA -1278 ETHIOPIC SYLLABLE CA -1279 ETHIOPIC SYLLABLE CU -127A ETHIOPIC SYLLABLE CI -127B ETHIOPIC SYLLABLE CAA -127C ETHIOPIC SYLLABLE CEE -127D ETHIOPIC SYLLABLE CE -127E ETHIOPIC SYLLABLE CO -127F ETHIOPIC SYLLABLE CWA -1280 ETHIOPIC SYLLABLE XA -1281 ETHIOPIC SYLLABLE XU -1282 ETHIOPIC SYLLABLE XI -1283 ETHIOPIC SYLLABLE XAA -1284 ETHIOPIC SYLLABLE XEE -1285 ETHIOPIC SYLLABLE XE -1286 ETHIOPIC SYLLABLE XO -1288 ETHIOPIC SYLLABLE XWA -128A ETHIOPIC SYLLABLE XWI -128B ETHIOPIC SYLLABLE XWAA -128C ETHIOPIC SYLLABLE XWEE -128D ETHIOPIC SYLLABLE XWE -1290 ETHIOPIC SYLLABLE NA -1291 ETHIOPIC SYLLABLE NU -1292 ETHIOPIC SYLLABLE NI -1293 ETHIOPIC SYLLABLE NAA -1294 ETHIOPIC SYLLABLE NEE -1295 ETHIOPIC SYLLABLE NE -1296 ETHIOPIC SYLLABLE NO -1297 ETHIOPIC SYLLABLE NWA -1298 ETHIOPIC SYLLABLE NYA -1299 ETHIOPIC SYLLABLE NYU -129A ETHIOPIC SYLLABLE NYI -129B ETHIOPIC SYLLABLE NYAA -129C ETHIOPIC SYLLABLE NYEE -129D ETHIOPIC SYLLABLE NYE -129E ETHIOPIC SYLLABLE NYO -129F ETHIOPIC SYLLABLE NYWA -12A0 ETHIOPIC SYLLABLE GLOTTAL A -12A1 ETHIOPIC SYLLABLE GLOTTAL U -12A2 ETHIOPIC SYLLABLE GLOTTAL I -12A3 ETHIOPIC SYLLABLE GLOTTAL AA -12A4 ETHIOPIC SYLLABLE GLOTTAL EE -12A5 ETHIOPIC SYLLABLE GLOTTAL E -12A6 ETHIOPIC SYLLABLE GLOTTAL O -12A7 ETHIOPIC SYLLABLE GLOTTAL WA -12A8 ETHIOPIC SYLLABLE KA -12A9 ETHIOPIC SYLLABLE KU -12AA ETHIOPIC SYLLABLE KI -12AB 
ETHIOPIC SYLLABLE KAA -12AC ETHIOPIC SYLLABLE KEE -12AD ETHIOPIC SYLLABLE KE -12AE ETHIOPIC SYLLABLE KO -12B0 ETHIOPIC SYLLABLE KWA -12B2 ETHIOPIC SYLLABLE KWI -12B3 ETHIOPIC SYLLABLE KWAA -12B4 ETHIOPIC SYLLABLE KWEE -12B5 ETHIOPIC SYLLABLE KWE -12B8 ETHIOPIC SYLLABLE KXA -12B9 ETHIOPIC SYLLABLE KXU -12BA ETHIOPIC SYLLABLE KXI -12BB ETHIOPIC SYLLABLE KXAA -12BC ETHIOPIC SYLLABLE KXEE -12BD ETHIOPIC SYLLABLE KXE -12BE ETHIOPIC SYLLABLE KXO -12C0 ETHIOPIC SYLLABLE KXWA -12C2 ETHIOPIC SYLLABLE KXWI -12C3 ETHIOPIC SYLLABLE KXWAA -12C4 ETHIOPIC SYLLABLE KXWEE -12C5 ETHIOPIC SYLLABLE KXWE -12C8 ETHIOPIC SYLLABLE WA -12C9 ETHIOPIC SYLLABLE WU -12CA ETHIOPIC SYLLABLE WI -12CB ETHIOPIC SYLLABLE WAA -12CC ETHIOPIC SYLLABLE WEE -12CD ETHIOPIC SYLLABLE WE -12CE ETHIOPIC SYLLABLE WO -12D0 ETHIOPIC SYLLABLE PHARYNGEAL A -12D1 ETHIOPIC SYLLABLE PHARYNGEAL U -12D2 ETHIOPIC SYLLABLE PHARYNGEAL I -12D3 ETHIOPIC SYLLABLE PHARYNGEAL AA -12D4 ETHIOPIC SYLLABLE PHARYNGEAL EE -12D5 ETHIOPIC SYLLABLE PHARYNGEAL E -12D6 ETHIOPIC SYLLABLE PHARYNGEAL O -12D8 ETHIOPIC SYLLABLE ZA -12D9 ETHIOPIC SYLLABLE ZU -12DA ETHIOPIC SYLLABLE ZI -12DB ETHIOPIC SYLLABLE ZAA -12DC ETHIOPIC SYLLABLE ZEE -12DD ETHIOPIC SYLLABLE ZE -12DE ETHIOPIC SYLLABLE ZO -12DF ETHIOPIC SYLLABLE ZWA -12E0 ETHIOPIC SYLLABLE ZHA -12E1 ETHIOPIC SYLLABLE ZHU -12E2 ETHIOPIC SYLLABLE ZHI -12E3 ETHIOPIC SYLLABLE ZHAA -12E4 ETHIOPIC SYLLABLE ZHEE -12E5 ETHIOPIC SYLLABLE ZHE -12E6 ETHIOPIC SYLLABLE ZHO -12E7 ETHIOPIC SYLLABLE ZHWA -12E8 ETHIOPIC SYLLABLE YA -12E9 ETHIOPIC SYLLABLE YU -12EA ETHIOPIC SYLLABLE YI -12EB ETHIOPIC SYLLABLE YAA -12EC ETHIOPIC SYLLABLE YEE -12ED ETHIOPIC SYLLABLE YE -12EE ETHIOPIC SYLLABLE YO -12EF ETHIOPIC SYLLABLE YWA -12F0 ETHIOPIC SYLLABLE DA -12F1 ETHIOPIC SYLLABLE DU -12F2 ETHIOPIC SYLLABLE DI -12F3 ETHIOPIC SYLLABLE DAA -12F4 ETHIOPIC SYLLABLE DEE -12F5 ETHIOPIC SYLLABLE DE -12F6 ETHIOPIC SYLLABLE DO -12F7 ETHIOPIC SYLLABLE DWA -12F8 ETHIOPIC SYLLABLE DDA -12F9 ETHIOPIC SYLLABLE DDU -12FA ETHIOPIC 
SYLLABLE DDI -12FB ETHIOPIC SYLLABLE DDAA -12FC ETHIOPIC SYLLABLE DDEE -12FD ETHIOPIC SYLLABLE DDE -12FE ETHIOPIC SYLLABLE DDO -12FF ETHIOPIC SYLLABLE DDWA -1300 ETHIOPIC SYLLABLE JA -1301 ETHIOPIC SYLLABLE JU -1302 ETHIOPIC SYLLABLE JI -1303 ETHIOPIC SYLLABLE JAA -1304 ETHIOPIC SYLLABLE JEE -1305 ETHIOPIC SYLLABLE JE -1306 ETHIOPIC SYLLABLE JO -1307 ETHIOPIC SYLLABLE JWA -1308 ETHIOPIC SYLLABLE GA -1309 ETHIOPIC SYLLABLE GU -130A ETHIOPIC SYLLABLE GI -130B ETHIOPIC SYLLABLE GAA -130C ETHIOPIC SYLLABLE GEE -130D ETHIOPIC SYLLABLE GE -130E ETHIOPIC SYLLABLE GO -1310 ETHIOPIC SYLLABLE GWA -1312 ETHIOPIC SYLLABLE GWI -1313 ETHIOPIC SYLLABLE GWAA -1314 ETHIOPIC SYLLABLE GWEE -1315 ETHIOPIC SYLLABLE GWE -1318 ETHIOPIC SYLLABLE GGA -1319 ETHIOPIC SYLLABLE GGU -131A ETHIOPIC SYLLABLE GGI -131B ETHIOPIC SYLLABLE GGAA -131C ETHIOPIC SYLLABLE GGEE -131D ETHIOPIC SYLLABLE GGE -131E ETHIOPIC SYLLABLE GGO -131F ETHIOPIC SYLLABLE GGWAA -1320 ETHIOPIC SYLLABLE THA -1321 ETHIOPIC SYLLABLE THU -1322 ETHIOPIC SYLLABLE THI -1323 ETHIOPIC SYLLABLE THAA -1324 ETHIOPIC SYLLABLE THEE -1325 ETHIOPIC SYLLABLE THE -1326 ETHIOPIC SYLLABLE THO -1327 ETHIOPIC SYLLABLE THWA -1328 ETHIOPIC SYLLABLE CHA -1329 ETHIOPIC SYLLABLE CHU -132A ETHIOPIC SYLLABLE CHI -132B ETHIOPIC SYLLABLE CHAA -132C ETHIOPIC SYLLABLE CHEE -132D ETHIOPIC SYLLABLE CHE -132E ETHIOPIC SYLLABLE CHO -132F ETHIOPIC SYLLABLE CHWA -1330 ETHIOPIC SYLLABLE PHA -1331 ETHIOPIC SYLLABLE PHU -1332 ETHIOPIC SYLLABLE PHI -1333 ETHIOPIC SYLLABLE PHAA -1334 ETHIOPIC SYLLABLE PHEE -1335 ETHIOPIC SYLLABLE PHE -1336 ETHIOPIC SYLLABLE PHO -1337 ETHIOPIC SYLLABLE PHWA -1338 ETHIOPIC SYLLABLE TSA -1339 ETHIOPIC SYLLABLE TSU -133A ETHIOPIC SYLLABLE TSI -133B ETHIOPIC SYLLABLE TSAA -133C ETHIOPIC SYLLABLE TSEE -133D ETHIOPIC SYLLABLE TSE -133E ETHIOPIC SYLLABLE TSO -133F ETHIOPIC SYLLABLE TSWA -1340 ETHIOPIC SYLLABLE TZA -1341 ETHIOPIC SYLLABLE TZU -1342 ETHIOPIC SYLLABLE TZI -1343 ETHIOPIC SYLLABLE TZAA -1344 ETHIOPIC SYLLABLE TZEE -1345 
ETHIOPIC SYLLABLE TZE -1346 ETHIOPIC SYLLABLE TZO -1348 ETHIOPIC SYLLABLE FA -1349 ETHIOPIC SYLLABLE FU -134A ETHIOPIC SYLLABLE FI -134B ETHIOPIC SYLLABLE FAA -134C ETHIOPIC SYLLABLE FEE -134D ETHIOPIC SYLLABLE FE -134E ETHIOPIC SYLLABLE FO -134F ETHIOPIC SYLLABLE FWAA -1350 ETHIOPIC SYLLABLE PA -1351 ETHIOPIC SYLLABLE PU -1352 ETHIOPIC SYLLABLE PI -1353 ETHIOPIC SYLLABLE PAA -1354 ETHIOPIC SYLLABLE PEE -1355 ETHIOPIC SYLLABLE PE -1356 ETHIOPIC SYLLABLE PO -1357 ETHIOPIC SYLLABLE PWAA -1358 ETHIOPIC SYLLABLE MYA -1359 ETHIOPIC SYLLABLE RYA -135A ETHIOPIC SYLLABLE FYA -1361 ETHIOPIC WORDSPACE -1362 ETHIOPIC FULL STOP -1363 ETHIOPIC COMMA -1364 ETHIOPIC SEMICOLON -1365 ETHIOPIC COLON -1366 ETHIOPIC PREFACE COLON -1367 ETHIOPIC QUESTION MARK -1368 ETHIOPIC PARAGRAPH SEPARATOR -1369 ETHIOPIC DIGIT ONE -136A ETHIOPIC DIGIT TWO -136B ETHIOPIC DIGIT THREE -136C ETHIOPIC DIGIT FOUR -136D ETHIOPIC DIGIT FIVE -136E ETHIOPIC DIGIT SIX -136F ETHIOPIC DIGIT SEVEN -1370 ETHIOPIC DIGIT EIGHT -1371 ETHIOPIC DIGIT NINE -1372 ETHIOPIC NUMBER TEN -1373 ETHIOPIC NUMBER TWENTY -1374 ETHIOPIC NUMBER THIRTY -1375 ETHIOPIC NUMBER FORTY -1376 ETHIOPIC NUMBER FIFTY -1377 ETHIOPIC NUMBER SIXTY -1378 ETHIOPIC NUMBER SEVENTY -1379 ETHIOPIC NUMBER EIGHTY -137A ETHIOPIC NUMBER NINETY -137B ETHIOPIC NUMBER HUNDRED -137C ETHIOPIC NUMBER TEN THOUSAND +1200 ETHIOPIC SYLLABLE HA +1201 ETHIOPIC SYLLABLE HU +1202 ETHIOPIC SYLLABLE HI +1203 ETHIOPIC SYLLABLE HAA +1204 ETHIOPIC SYLLABLE HEE +1205 ETHIOPIC SYLLABLE HE +1206 ETHIOPIC SYLLABLE HO +1208 ETHIOPIC SYLLABLE LA +1209 ETHIOPIC SYLLABLE LU +120a ETHIOPIC SYLLABLE LI +120b ETHIOPIC SYLLABLE LAA +120c ETHIOPIC SYLLABLE LEE +120d ETHIOPIC SYLLABLE LE +120e ETHIOPIC SYLLABLE LO +120f ETHIOPIC SYLLABLE LWA +1210 ETHIOPIC SYLLABLE HHA +1211 ETHIOPIC SYLLABLE HHU +1212 ETHIOPIC SYLLABLE HHI +1213 ETHIOPIC SYLLABLE HHAA +1214 ETHIOPIC SYLLABLE HHEE +1215 ETHIOPIC SYLLABLE HHE +1216 ETHIOPIC SYLLABLE HHO +1217 ETHIOPIC SYLLABLE HHWA +1218 ETHIOPIC SYLLABLE 
MA +1219 ETHIOPIC SYLLABLE MU +121a ETHIOPIC SYLLABLE MI +121b ETHIOPIC SYLLABLE MAA +121c ETHIOPIC SYLLABLE MEE +121d ETHIOPIC SYLLABLE ME +121e ETHIOPIC SYLLABLE MO +121f ETHIOPIC SYLLABLE MWA +1220 ETHIOPIC SYLLABLE SZA +1221 ETHIOPIC SYLLABLE SZU +1222 ETHIOPIC SYLLABLE SZI +1223 ETHIOPIC SYLLABLE SZAA +1224 ETHIOPIC SYLLABLE SZEE +1225 ETHIOPIC SYLLABLE SZE +1226 ETHIOPIC SYLLABLE SZO +1227 ETHIOPIC SYLLABLE SZWA +1228 ETHIOPIC SYLLABLE RA +1229 ETHIOPIC SYLLABLE RU +122a ETHIOPIC SYLLABLE RI +122b ETHIOPIC SYLLABLE RAA +122c ETHIOPIC SYLLABLE REE +122d ETHIOPIC SYLLABLE RE +122e ETHIOPIC SYLLABLE RO +122f ETHIOPIC SYLLABLE RWA +1230 ETHIOPIC SYLLABLE SA +1231 ETHIOPIC SYLLABLE SU +1232 ETHIOPIC SYLLABLE SI +1233 ETHIOPIC SYLLABLE SAA +1234 ETHIOPIC SYLLABLE SEE +1235 ETHIOPIC SYLLABLE SE +1236 ETHIOPIC SYLLABLE SO +1237 ETHIOPIC SYLLABLE SWA +1238 ETHIOPIC SYLLABLE SHA +1239 ETHIOPIC SYLLABLE SHU +123a ETHIOPIC SYLLABLE SHI +123b ETHIOPIC SYLLABLE SHAA +123c ETHIOPIC SYLLABLE SHEE +123d ETHIOPIC SYLLABLE SHE +123e ETHIOPIC SYLLABLE SHO +123f ETHIOPIC SYLLABLE SHWA +1240 ETHIOPIC SYLLABLE QA +1241 ETHIOPIC SYLLABLE QU +1242 ETHIOPIC SYLLABLE QI +1243 ETHIOPIC SYLLABLE QAA +1244 ETHIOPIC SYLLABLE QEE +1245 ETHIOPIC SYLLABLE QE +1246 ETHIOPIC SYLLABLE QO +1248 ETHIOPIC SYLLABLE QWA +124a ETHIOPIC SYLLABLE QWI +124b ETHIOPIC SYLLABLE QWAA +124c ETHIOPIC SYLLABLE QWEE +124d ETHIOPIC SYLLABLE QWE +1250 ETHIOPIC SYLLABLE QHA +1251 ETHIOPIC SYLLABLE QHU +1252 ETHIOPIC SYLLABLE QHI +1253 ETHIOPIC SYLLABLE QHAA +1254 ETHIOPIC SYLLABLE QHEE +1255 ETHIOPIC SYLLABLE QHE +1256 ETHIOPIC SYLLABLE QHO +1258 ETHIOPIC SYLLABLE QHWA +125a ETHIOPIC SYLLABLE QHWI +125b ETHIOPIC SYLLABLE QHWAA +125c ETHIOPIC SYLLABLE QHWEE +125d ETHIOPIC SYLLABLE QHWE +1260 ETHIOPIC SYLLABLE BA +1261 ETHIOPIC SYLLABLE BU +1262 ETHIOPIC SYLLABLE BI +1263 ETHIOPIC SYLLABLE BAA +1264 ETHIOPIC SYLLABLE BEE +1265 ETHIOPIC SYLLABLE BE +1266 ETHIOPIC SYLLABLE BO +1267 ETHIOPIC SYLLABLE BWA +1268 ETHIOPIC 
SYLLABLE VA +1269 ETHIOPIC SYLLABLE VU +126a ETHIOPIC SYLLABLE VI +126b ETHIOPIC SYLLABLE VAA +126c ETHIOPIC SYLLABLE VEE +126d ETHIOPIC SYLLABLE VE +126e ETHIOPIC SYLLABLE VO +126f ETHIOPIC SYLLABLE VWA +1270 ETHIOPIC SYLLABLE TA +1271 ETHIOPIC SYLLABLE TU +1272 ETHIOPIC SYLLABLE TI +1273 ETHIOPIC SYLLABLE TAA +1274 ETHIOPIC SYLLABLE TEE +1275 ETHIOPIC SYLLABLE TE +1276 ETHIOPIC SYLLABLE TO +1277 ETHIOPIC SYLLABLE TWA +1278 ETHIOPIC SYLLABLE CA +1279 ETHIOPIC SYLLABLE CU +127a ETHIOPIC SYLLABLE CI +127b ETHIOPIC SYLLABLE CAA +127c ETHIOPIC SYLLABLE CEE +127d ETHIOPIC SYLLABLE CE +127e ETHIOPIC SYLLABLE CO +127f ETHIOPIC SYLLABLE CWA +1280 ETHIOPIC SYLLABLE XA +1281 ETHIOPIC SYLLABLE XU +1282 ETHIOPIC SYLLABLE XI +1283 ETHIOPIC SYLLABLE XAA +1284 ETHIOPIC SYLLABLE XEE +1285 ETHIOPIC SYLLABLE XE +1286 ETHIOPIC SYLLABLE XO +1288 ETHIOPIC SYLLABLE XWA +128a ETHIOPIC SYLLABLE XWI +128b ETHIOPIC SYLLABLE XWAA +128c ETHIOPIC SYLLABLE XWEE +128d ETHIOPIC SYLLABLE XWE +1290 ETHIOPIC SYLLABLE NA +1291 ETHIOPIC SYLLABLE NU +1292 ETHIOPIC SYLLABLE NI +1293 ETHIOPIC SYLLABLE NAA +1294 ETHIOPIC SYLLABLE NEE +1295 ETHIOPIC SYLLABLE NE +1296 ETHIOPIC SYLLABLE NO +1297 ETHIOPIC SYLLABLE NWA +1298 ETHIOPIC SYLLABLE NYA +1299 ETHIOPIC SYLLABLE NYU +129a ETHIOPIC SYLLABLE NYI +129b ETHIOPIC SYLLABLE NYAA +129c ETHIOPIC SYLLABLE NYEE +129d ETHIOPIC SYLLABLE NYE +129e ETHIOPIC SYLLABLE NYO +129f ETHIOPIC SYLLABLE NYWA +12a0 ETHIOPIC SYLLABLE GLOTTAL A +12a1 ETHIOPIC SYLLABLE GLOTTAL U +12a2 ETHIOPIC SYLLABLE GLOTTAL I +12a3 ETHIOPIC SYLLABLE GLOTTAL AA +12a4 ETHIOPIC SYLLABLE GLOTTAL EE +12a5 ETHIOPIC SYLLABLE GLOTTAL E +12a6 ETHIOPIC SYLLABLE GLOTTAL O +12a7 ETHIOPIC SYLLABLE GLOTTAL WA +12a8 ETHIOPIC SYLLABLE KA +12a9 ETHIOPIC SYLLABLE KU +12aa ETHIOPIC SYLLABLE KI +12ab ETHIOPIC SYLLABLE KAA +12ac ETHIOPIC SYLLABLE KEE +12ad ETHIOPIC SYLLABLE KE +12ae ETHIOPIC SYLLABLE KO +12b0 ETHIOPIC SYLLABLE KWA +12b2 ETHIOPIC SYLLABLE KWI +12b3 ETHIOPIC SYLLABLE KWAA +12b4 ETHIOPIC SYLLABLE 
KWEE +12b5 ETHIOPIC SYLLABLE KWE +12b8 ETHIOPIC SYLLABLE KXA +12b9 ETHIOPIC SYLLABLE KXU +12ba ETHIOPIC SYLLABLE KXI +12bb ETHIOPIC SYLLABLE KXAA +12bc ETHIOPIC SYLLABLE KXEE +12bd ETHIOPIC SYLLABLE KXE +12be ETHIOPIC SYLLABLE KXO +12c0 ETHIOPIC SYLLABLE KXWA +12c2 ETHIOPIC SYLLABLE KXWI +12c3 ETHIOPIC SYLLABLE KXWAA +12c4 ETHIOPIC SYLLABLE KXWEE +12c5 ETHIOPIC SYLLABLE KXWE +12c8 ETHIOPIC SYLLABLE WA +12c9 ETHIOPIC SYLLABLE WU +12ca ETHIOPIC SYLLABLE WI +12cb ETHIOPIC SYLLABLE WAA +12cc ETHIOPIC SYLLABLE WEE +12cd ETHIOPIC SYLLABLE WE +12ce ETHIOPIC SYLLABLE WO +12d0 ETHIOPIC SYLLABLE PHARYNGEAL A +12d1 ETHIOPIC SYLLABLE PHARYNGEAL U +12d2 ETHIOPIC SYLLABLE PHARYNGEAL I +12d3 ETHIOPIC SYLLABLE PHARYNGEAL AA +12d4 ETHIOPIC SYLLABLE PHARYNGEAL EE +12d5 ETHIOPIC SYLLABLE PHARYNGEAL E +12d6 ETHIOPIC SYLLABLE PHARYNGEAL O +12d8 ETHIOPIC SYLLABLE ZA +12d9 ETHIOPIC SYLLABLE ZU +12da ETHIOPIC SYLLABLE ZI +12db ETHIOPIC SYLLABLE ZAA +12dc ETHIOPIC SYLLABLE ZEE +12dd ETHIOPIC SYLLABLE ZE +12de ETHIOPIC SYLLABLE ZO +12df ETHIOPIC SYLLABLE ZWA +12e0 ETHIOPIC SYLLABLE ZHA +12e1 ETHIOPIC SYLLABLE ZHU +12e2 ETHIOPIC SYLLABLE ZHI +12e3 ETHIOPIC SYLLABLE ZHAA +12e4 ETHIOPIC SYLLABLE ZHEE +12e5 ETHIOPIC SYLLABLE ZHE +12e6 ETHIOPIC SYLLABLE ZHO +12e7 ETHIOPIC SYLLABLE ZHWA +12e8 ETHIOPIC SYLLABLE YA +12e9 ETHIOPIC SYLLABLE YU +12ea ETHIOPIC SYLLABLE YI +12eb ETHIOPIC SYLLABLE YAA +12ec ETHIOPIC SYLLABLE YEE +12ed ETHIOPIC SYLLABLE YE +12ee ETHIOPIC SYLLABLE YO +12f0 ETHIOPIC SYLLABLE DA +12f1 ETHIOPIC SYLLABLE DU +12f2 ETHIOPIC SYLLABLE DI +12f3 ETHIOPIC SYLLABLE DAA +12f4 ETHIOPIC SYLLABLE DEE +12f5 ETHIOPIC SYLLABLE DE +12f6 ETHIOPIC SYLLABLE DO +12f7 ETHIOPIC SYLLABLE DWA +12f8 ETHIOPIC SYLLABLE DDA +12f9 ETHIOPIC SYLLABLE DDU +12fa ETHIOPIC SYLLABLE DDI +12fb ETHIOPIC SYLLABLE DDAA +12fc ETHIOPIC SYLLABLE DDEE +12fd ETHIOPIC SYLLABLE DDE +12fe ETHIOPIC SYLLABLE DDO +12ff ETHIOPIC SYLLABLE DDWA +1300 ETHIOPIC SYLLABLE JA +1301 ETHIOPIC SYLLABLE JU +1302 ETHIOPIC SYLLABLE JI +1303 
ETHIOPIC SYLLABLE JAA +1304 ETHIOPIC SYLLABLE JEE +1305 ETHIOPIC SYLLABLE JE +1306 ETHIOPIC SYLLABLE JO +1307 ETHIOPIC SYLLABLE JWA +1308 ETHIOPIC SYLLABLE GA +1309 ETHIOPIC SYLLABLE GU +130a ETHIOPIC SYLLABLE GI +130b ETHIOPIC SYLLABLE GAA +130c ETHIOPIC SYLLABLE GEE +130d ETHIOPIC SYLLABLE GE +130e ETHIOPIC SYLLABLE GO +1310 ETHIOPIC SYLLABLE GWA +1312 ETHIOPIC SYLLABLE GWI +1313 ETHIOPIC SYLLABLE GWAA +1314 ETHIOPIC SYLLABLE GWEE +1315 ETHIOPIC SYLLABLE GWE +1318 ETHIOPIC SYLLABLE GGA +1319 ETHIOPIC SYLLABLE GGU +131a ETHIOPIC SYLLABLE GGI +131b ETHIOPIC SYLLABLE GGAA +131c ETHIOPIC SYLLABLE GGEE +131d ETHIOPIC SYLLABLE GGE +131e ETHIOPIC SYLLABLE GGO +1320 ETHIOPIC SYLLABLE THA +1321 ETHIOPIC SYLLABLE THU +1322 ETHIOPIC SYLLABLE THI +1323 ETHIOPIC SYLLABLE THAA +1324 ETHIOPIC SYLLABLE THEE +1325 ETHIOPIC SYLLABLE THE +1326 ETHIOPIC SYLLABLE THO +1327 ETHIOPIC SYLLABLE THWA +1328 ETHIOPIC SYLLABLE CHA +1329 ETHIOPIC SYLLABLE CHU +132a ETHIOPIC SYLLABLE CHI +132b ETHIOPIC SYLLABLE CHAA +132c ETHIOPIC SYLLABLE CHEE +132d ETHIOPIC SYLLABLE CHE +132e ETHIOPIC SYLLABLE CHO +132f ETHIOPIC SYLLABLE CHWA +1330 ETHIOPIC SYLLABLE PHA +1331 ETHIOPIC SYLLABLE PHU +1332 ETHIOPIC SYLLABLE PHI +1333 ETHIOPIC SYLLABLE PHAA +1334 ETHIOPIC SYLLABLE PHEE +1335 ETHIOPIC SYLLABLE PHE +1336 ETHIOPIC SYLLABLE PHO +1337 ETHIOPIC SYLLABLE PHWA +1338 ETHIOPIC SYLLABLE TSA +1339 ETHIOPIC SYLLABLE TSU +133a ETHIOPIC SYLLABLE TSI +133b ETHIOPIC SYLLABLE TSAA +133c ETHIOPIC SYLLABLE TSEE +133d ETHIOPIC SYLLABLE TSE +133e ETHIOPIC SYLLABLE TSO +133f ETHIOPIC SYLLABLE TSWA +1340 ETHIOPIC SYLLABLE TZA +1341 ETHIOPIC SYLLABLE TZU +1342 ETHIOPIC SYLLABLE TZI +1343 ETHIOPIC SYLLABLE TZAA +1344 ETHIOPIC SYLLABLE TZEE +1345 ETHIOPIC SYLLABLE TZE +1346 ETHIOPIC SYLLABLE TZO +1348 ETHIOPIC SYLLABLE FA +1349 ETHIOPIC SYLLABLE FU +134a ETHIOPIC SYLLABLE FI +134b ETHIOPIC SYLLABLE FAA +134c ETHIOPIC SYLLABLE FEE +134d ETHIOPIC SYLLABLE FE +134e ETHIOPIC SYLLABLE FO +134f ETHIOPIC SYLLABLE FWA +1350 
ETHIOPIC SYLLABLE PA +1351 ETHIOPIC SYLLABLE PU +1352 ETHIOPIC SYLLABLE PI +1353 ETHIOPIC SYLLABLE PAA +1354 ETHIOPIC SYLLABLE PEE +1355 ETHIOPIC SYLLABLE PE +1356 ETHIOPIC SYLLABLE PO +1357 ETHIOPIC SYLLABLE PWA +1358 ETHIOPIC SYLLABLE RYA +1359 ETHIOPIC SYLLABLE MYA +135a ETHIOPIC SYLLABLE FYA +1361 ETHIOPIC WORDSPACE +1362 ETHIOPIC FULL STOP +1363 ETHIOPIC COMMA +1364 ETHIOPIC SEMICOLON +1365 ETHIOPIC COLON +1366 ETHIOPIC PREFACE COLON +1367 ETHIOPIC QUESTION MARK +1368 ETHIOPIC PARAGRAPH SEPARATOR +1369 ETHIOPIC DIGIT ONE +136a ETHIOPIC DIGIT TWO +136b ETHIOPIC DIGIT THREE +136c ETHIOPIC DIGIT FOUR +136d ETHIOPIC DIGIT FIVE +136e ETHIOPIC DIGIT SIX +136f ETHIOPIC DIGIT SEVEN +1370 ETHIOPIC DIGIT EIGHT +1371 ETHIOPIC DIGIT NINE +1372 ETHIOPIC NUMBER TEN +1373 ETHIOPIC NUMBER TWENTY +1374 ETHIOPIC NUMBER THIRTY +1375 ETHIOPIC NUMBER FORTY +1376 ETHIOPIC NUMBER FIFTY +1377 ETHIOPIC NUMBER SIXTY +1378 ETHIOPIC NUMBER SEVENTY +1379 ETHIOPIC NUMBER EIGHTY +137a ETHIOPIC NUMBER NINETY +137b ETHIOPIC NUMBER HUNDRED +137c ETHIOPIC NUMBER TEN THOUSAND 1e00 LATIN CAPITAL LETTER A WITH RING BELOW 1e01 LATIN SMALL LETTER A WITH RING BELOW 1e02 LATIN CAPITAL LETTER B WITH DOT ABOVE diff --git a/lib/unicode/To/Digit.pl b/lib/unicode/To/Digit.pl index 8f60c4f..7ccd849 100644 --- a/lib/unicode/To/Digit.pl +++ b/lib/unicode/To/Digit.pl @@ -16,6 +16,7 @@ return <<'END'; 0e50 0e59 0000 0ed0 0ed9 0000 0f20 0f29 0000 +1369 1371 0001 2070 0000 2074 2079 0004 2080 2089 0000 diff --git a/lib/unicode/mktables.PL b/lib/unicode/mktables.PL index 306f2a4..82d8307 100755 --- a/lib/unicode/mktables.PL +++ b/lib/unicode/mktables.PL @@ -9,17 +9,23 @@ mkdir "To", 0777; @todo = ( # typical - ['IsAlnum', '$cat =~ /^L[ulo]|^Nd/ or $code eq "005F"', ''], - ['IsAlpha', '$cat =~ /^L[ulo]/', ''], - ['IsSpace', '$cat =~ /^Z/ or $code lt "0020" and chr(hex $code) =~ /^\s/', ''], - ['IsDigit', '$cat =~ /^Nd$/', ''], - ['IsUpper', '$cat =~ /^Lu$/', ''], - ['IsLower', '$cat =~ /^Ll$/', ''], - ['IsPrint', 
'$cat =~ /^[^C]/', ''], - ['ToUpper', '$up', '$up'], - ['ToLower', '$down', '$down'], - ['ToTitle', '$title', '$title'], - ['ToDigit', '$dec ne ""', '$dec'], + ['IsWord', '$cat =~ /^L[ulo]|^Nd/ or $code eq "005F"', ''], + ['IsAlnum', '$cat =~ /^L[ulo]|^Nd/', ''], + ['IsAlpha', '$cat =~ /^L[ulo]/', ''], + ['IsSpace', '$cat =~ /^Z/ or $code lt "0020" and chr(hex $code) =~ /^\s/', ''], + ['IsDigit', '$cat =~ /^Nd$/', ''], + ['IsUpper', '$cat =~ /^Lu$/', ''], + ['IsLower', '$cat =~ /^Ll$/', ''], + ['IsASCII', 'hex $code <= 127', ''], + ['IsCntrl', '$cat =~ /^C/', ''], + ['IsGraph', '$cat =~ /^[^C]/ and $code ne "0020"', ''], + ['IsPrint', '$cat =~ /^[^C]/', ''], + ['IsPunct', '$cat =~ /^P/', ''], + ['IsXDigit', '$code =~ /^00(3[0-9]|[46][1-6])$/', ''], + ['ToUpper', '$up', '$up'], + ['ToLower', '$down', '$down'], + ['ToTitle', '$title', '$title'], + ['ToDigit', '$dec ne ""', '$dec'], # Name diff --git a/objXSUB.h b/objXSUB.h index d91f84d..6f201dc 100644 --- a/objXSUB.h +++ b/objXSUB.h @@ -742,16 +742,26 @@ #define PL_unsafe pPerl->PL_unsafe #undef PL_utf8_alnum #define PL_utf8_alnum pPerl->PL_utf8_alnum +#undef PL_utf8_alnumc +#define PL_utf8_alnumc pPerl->PL_utf8_alnumc #undef PL_utf8_alpha #define PL_utf8_alpha pPerl->PL_utf8_alpha +#undef PL_utf8_ascii +#define PL_utf8_ascii pPerl->PL_utf8_ascii +#undef PL_utf8_cntrl +#define PL_utf8_cntrl pPerl->PL_utf8_cntrl #undef PL_utf8_digit #define PL_utf8_digit pPerl->PL_utf8_digit +#undef PL_utf8_graph +#define PL_utf8_graph pPerl->PL_utf8_graph #undef PL_utf8_lower #define PL_utf8_lower pPerl->PL_utf8_lower #undef PL_utf8_mark #define PL_utf8_mark pPerl->PL_utf8_mark #undef PL_utf8_print #define PL_utf8_print pPerl->PL_utf8_print +#undef PL_utf8_punct +#define PL_utf8_punct pPerl->PL_utf8_punct #undef PL_utf8_space #define PL_utf8_space pPerl->PL_utf8_space #undef PL_utf8_tolower @@ -762,6 +772,8 @@ #define PL_utf8_toupper pPerl->PL_utf8_toupper #undef PL_utf8_upper #define PL_utf8_upper pPerl->PL_utf8_upper +#undef 
PL_utf8_xdigit +#define PL_utf8_xdigit pPerl->PL_utf8_xdigit #undef PL_uudmap #define PL_uudmap pPerl->PL_uudmap #undef PL_warnhook @@ -1505,6 +1517,10 @@ #define Perl_is_uni_alnum pPerl->Perl_is_uni_alnum #undef is_uni_alnum #define is_uni_alnum Perl_is_uni_alnum +#undef Perl_is_uni_alnumc +#define Perl_is_uni_alnumc pPerl->Perl_is_uni_alnumc +#undef is_uni_alnumc +#define is_uni_alnumc Perl_is_uni_alnumc #undef Perl_is_uni_idfirst #define Perl_is_uni_idfirst pPerl->Perl_is_uni_idfirst #undef is_uni_idfirst @@ -1513,10 +1529,22 @@ #define Perl_is_uni_alpha pPerl->Perl_is_uni_alpha #undef is_uni_alpha #define is_uni_alpha Perl_is_uni_alpha +#undef Perl_is_uni_ascii +#define Perl_is_uni_ascii pPerl->Perl_is_uni_ascii +#undef is_uni_ascii +#define is_uni_ascii Perl_is_uni_ascii #undef Perl_is_uni_space #define Perl_is_uni_space pPerl->Perl_is_uni_space #undef is_uni_space #define is_uni_space Perl_is_uni_space +#undef Perl_is_uni_cntrl +#define Perl_is_uni_cntrl pPerl->Perl_is_uni_cntrl +#undef is_uni_cntrl +#define is_uni_cntrl Perl_is_uni_cntrl +#undef Perl_is_uni_graph +#define Perl_is_uni_graph pPerl->Perl_is_uni_graph +#undef is_uni_graph +#define is_uni_graph Perl_is_uni_graph #undef Perl_is_uni_digit #define Perl_is_uni_digit pPerl->Perl_is_uni_digit #undef is_uni_digit @@ -1533,6 +1561,14 @@ #define Perl_is_uni_print pPerl->Perl_is_uni_print #undef is_uni_print #define is_uni_print Perl_is_uni_print +#undef Perl_is_uni_punct +#define Perl_is_uni_punct pPerl->Perl_is_uni_punct +#undef is_uni_punct +#define is_uni_punct Perl_is_uni_punct +#undef Perl_is_uni_xdigit +#define Perl_is_uni_xdigit pPerl->Perl_is_uni_xdigit +#undef is_uni_xdigit +#define is_uni_xdigit Perl_is_uni_xdigit #undef Perl_to_uni_upper #define Perl_to_uni_upper pPerl->Perl_to_uni_upper #undef to_uni_upper @@ -1549,6 +1585,10 @@ #define Perl_is_uni_alnum_lc pPerl->Perl_is_uni_alnum_lc #undef is_uni_alnum_lc #define is_uni_alnum_lc Perl_is_uni_alnum_lc +#undef Perl_is_uni_alnumc_lc +#define 
Perl_is_uni_alnumc_lc pPerl->Perl_is_uni_alnumc_lc +#undef is_uni_alnumc_lc +#define is_uni_alnumc_lc Perl_is_uni_alnumc_lc #undef Perl_is_uni_idfirst_lc #define Perl_is_uni_idfirst_lc pPerl->Perl_is_uni_idfirst_lc #undef is_uni_idfirst_lc @@ -1557,10 +1597,22 @@ #define Perl_is_uni_alpha_lc pPerl->Perl_is_uni_alpha_lc #undef is_uni_alpha_lc #define is_uni_alpha_lc Perl_is_uni_alpha_lc +#undef Perl_is_uni_ascii_lc +#define Perl_is_uni_ascii_lc pPerl->Perl_is_uni_ascii_lc +#undef is_uni_ascii_lc +#define is_uni_ascii_lc Perl_is_uni_ascii_lc #undef Perl_is_uni_space_lc #define Perl_is_uni_space_lc pPerl->Perl_is_uni_space_lc #undef is_uni_space_lc #define is_uni_space_lc Perl_is_uni_space_lc +#undef Perl_is_uni_cntrl_lc +#define Perl_is_uni_cntrl_lc pPerl->Perl_is_uni_cntrl_lc +#undef is_uni_cntrl_lc +#define is_uni_cntrl_lc Perl_is_uni_cntrl_lc +#undef Perl_is_uni_graph_lc +#define Perl_is_uni_graph_lc pPerl->Perl_is_uni_graph_lc +#undef is_uni_graph_lc +#define is_uni_graph_lc Perl_is_uni_graph_lc #undef Perl_is_uni_digit_lc #define Perl_is_uni_digit_lc pPerl->Perl_is_uni_digit_lc #undef is_uni_digit_lc @@ -1577,6 +1629,14 @@ #define Perl_is_uni_print_lc pPerl->Perl_is_uni_print_lc #undef is_uni_print_lc #define is_uni_print_lc Perl_is_uni_print_lc +#undef Perl_is_uni_punct_lc +#define Perl_is_uni_punct_lc pPerl->Perl_is_uni_punct_lc +#undef is_uni_punct_lc +#define is_uni_punct_lc Perl_is_uni_punct_lc +#undef Perl_is_uni_xdigit_lc +#define Perl_is_uni_xdigit_lc pPerl->Perl_is_uni_xdigit_lc +#undef is_uni_xdigit_lc +#define is_uni_xdigit_lc Perl_is_uni_xdigit_lc #undef Perl_to_uni_upper_lc #define Perl_to_uni_upper_lc pPerl->Perl_to_uni_upper_lc #undef to_uni_upper_lc @@ -1593,6 +1653,10 @@ #define Perl_is_utf8_alnum pPerl->Perl_is_utf8_alnum #undef is_utf8_alnum #define is_utf8_alnum Perl_is_utf8_alnum +#undef Perl_is_utf8_alnumc +#define Perl_is_utf8_alnumc pPerl->Perl_is_utf8_alnumc +#undef is_utf8_alnumc +#define is_utf8_alnumc Perl_is_utf8_alnumc #undef 
Perl_is_utf8_idfirst #define Perl_is_utf8_idfirst pPerl->Perl_is_utf8_idfirst #undef is_utf8_idfirst @@ -1601,14 +1665,26 @@ #define Perl_is_utf8_alpha pPerl->Perl_is_utf8_alpha #undef is_utf8_alpha #define is_utf8_alpha Perl_is_utf8_alpha +#undef Perl_is_utf8_ascii +#define Perl_is_utf8_ascii pPerl->Perl_is_utf8_ascii +#undef is_utf8_ascii +#define is_utf8_ascii Perl_is_utf8_ascii #undef Perl_is_utf8_space #define Perl_is_utf8_space pPerl->Perl_is_utf8_space #undef is_utf8_space #define is_utf8_space Perl_is_utf8_space +#undef Perl_is_utf8_cntrl +#define Perl_is_utf8_cntrl pPerl->Perl_is_utf8_cntrl +#undef is_utf8_cntrl +#define is_utf8_cntrl Perl_is_utf8_cntrl #undef Perl_is_utf8_digit #define Perl_is_utf8_digit pPerl->Perl_is_utf8_digit #undef is_utf8_digit #define is_utf8_digit Perl_is_utf8_digit +#undef Perl_is_utf8_graph +#define Perl_is_utf8_graph pPerl->Perl_is_utf8_graph +#undef is_utf8_graph +#define is_utf8_graph Perl_is_utf8_graph #undef Perl_is_utf8_upper #define Perl_is_utf8_upper pPerl->Perl_is_utf8_upper #undef is_utf8_upper @@ -1621,6 +1697,14 @@ #define Perl_is_utf8_print pPerl->Perl_is_utf8_print #undef is_utf8_print #define is_utf8_print Perl_is_utf8_print +#undef Perl_is_utf8_punct +#define Perl_is_utf8_punct pPerl->Perl_is_utf8_punct +#undef is_utf8_punct +#define is_utf8_punct Perl_is_utf8_punct +#undef Perl_is_utf8_xdigit +#define Perl_is_utf8_xdigit pPerl->Perl_is_utf8_xdigit +#undef is_utf8_xdigit +#define is_utf8_xdigit Perl_is_utf8_xdigit #undef Perl_is_utf8_mark #define Perl_is_utf8_mark pPerl->Perl_is_utf8_mark #undef is_utf8_mark @@ -3509,6 +3593,10 @@ #define Perl_ck_rvconst pPerl->Perl_ck_rvconst #undef ck_rvconst #define ck_rvconst Perl_ck_rvconst +#undef Perl_ck_sassign +#define Perl_ck_sassign pPerl->Perl_ck_sassign +#undef ck_sassign +#define ck_sassign Perl_ck_sassign #undef Perl_ck_scmp #define Perl_ck_scmp pPerl->Perl_ck_scmp #undef ck_scmp diff --git a/pod/perldelta.pod b/pod/perldelta.pod index de727db..2278a54 100644 --- 
a/pod/perldelta.pod +++ b/pod/perldelta.pod @@ -121,6 +121,13 @@ Unix and UNICOS also have 64-bit support. =head2 Better syntax checks on parenthesized unary operators +TODO + +=head2 POSIX character class syntax [: :] supported + +For example, to match alphabetic characters use /[[:alpha:]]/. +See L<perlre> for details. + Expressions such as: print defined(&foo,&bar,&baz); diff --git a/pod/perldiag.pod b/pod/perldiag.pod index d7b9024..b352e9c 100644 --- a/pod/perldiag.pod +++ b/pod/perldiag.pod @@ -1000,21 +1000,23 @@ there is no builtin with the name C<word>. opposed to a subroutine reference): no such method callable via the package. If method name is C<???>, this is an internal error. -=item Character class syntax [. .] is reserved for future extensions +=item Character class [:%s:] unknown -(W) Within regular expression character classes ([]) the syntax beginning -with "[." and ending with ".]" is reserved for future extensions. -If you need to represent those character sequences inside a regular -expression character class, just quote the square brackets with the -backslash: "\[." and ".\]". +(F) The class in the character class [: :] syntax is unknown. -=item Character class syntax [: :] is reserved for future extensions +=item Character class syntax [%s] belongs inside character classes + +(W) The character class constructs [: :], [= =], and [. .] go +I<inside> character classes; the [] are part of the construct. For +example: /[[:alpha:]]/ + +=item Character class syntax [. .] is reserved for future extensions (W) Within regular expression character classes ([]) the syntax beginning -with "[:" and ending with ":]" is reserved for future extensions. +with "[." and ending with ".]" is reserved for future extensions. If you need to represent those character sequences inside a regular expression character class, just quote the square brackets with the -backslash: "\[:" and ":\]". +backslash: "\[." and ".\]". 
=item Character class syntax [= =] is reserved for future extensions diff --git a/pod/perlre.pod b/pod/perlre.pod index ca95638..470c593 100644 --- a/pod/perlre.pod +++ b/pod/perlre.pod @@ -186,6 +186,100 @@ current locale. See L<perllocale>. You may use C<\w>, C<\W>, C<\s>, C<\S>, C<\d>, and C<\D> within character classes (though not as either end of a range). See L<utf8> for details about C<\pP>, C<\PP>, and C<\X>. +The POSIX character class syntax + + [:class:] + +is also available. The available classes and their \-equivalents +(if any) are as follows: + + alpha + alnum + ascii + cntrl + digit \d + graph + lower + print + punct + space \s + upper + word \w + xdigit + +Note that the [] are part of the [::] construct, not part of the whole +character class. For example: + + [01[:alpha:]%] + +matches one, zero, any alphabetic character, and the percentage sign. + +The exact meanings of the above classes depend on many things: +if the C<utf8> pragma is used, the following equivalences to Unicode +\p{} constructs hold: + + alpha IsAlpha + alnum IsAlnum + ascii IsASCII + cntrl IsCntrl + digit IsDigit + graph IsGraph + lower IsLower + print IsPrint + punct IsPunct + space IsSpace + upper IsUpper + word IsWord + xdigit IsXDigit + +For example, [:lower:] and \p{IsLower} are equivalent. + +If the C<utf8> pragma is not used but the C<locale> pragma is, the +classes correlate with the isalpha(3) interface (except for `word', +which is a Perl extension). + +The classes whose names may not be obvious are: + +=over 4 + +=item cntrl + + Any control character. Usually characters that don't produce + output as such but instead control the terminal somehow: + for example newline and backspace are control characters. + +=item graph + + Any alphanumeric or punctuation character. + +=item print + + Any alphanumeric or punctuation character or space. + +=item punct + + Any punctuation character. + +=item xdigit + + Any hexadecimal digit. 
Though this may feel silly + (/[0-9a-f]/i would work just fine) it is included + for completeness. + +=item + +=back + +You can negate the [::] character classes by prefixing the class name +with a '^'. This is a Perl extension. For example: + + ^digit \D \P{IsDigit} + ^space \S \P{IsSpace} + ^word \W \P{IsWord} + +The POSIX character classes [.cc.] and [=cc=] are B<not> supported +and trying to use them will cause a warning. + Perl defines the following zero-width assertions: \b Match a word boundary diff --git a/proto.h b/proto.h index 7fa6424..402876a 100644 --- a/proto.h +++ b/proto.h @@ -196,35 +196,53 @@ VIRTUAL char* Perl_instr(pTHX_ const char* big, const char* little); VIRTUAL bool Perl_io_close(pTHX_ IO* io); VIRTUAL OP* Perl_invert(pTHX_ OP* cmd); VIRTUAL bool Perl_is_uni_alnum(pTHX_ U32 c); +VIRTUAL bool Perl_is_uni_alnumc(pTHX_ U32 c); VIRTUAL bool Perl_is_uni_idfirst(pTHX_ U32 c); VIRTUAL bool Perl_is_uni_alpha(pTHX_ U32 c); +VIRTUAL bool Perl_is_uni_ascii(pTHX_ U32 c); VIRTUAL bool Perl_is_uni_space(pTHX_ U32 c); +VIRTUAL bool Perl_is_uni_cntrl(pTHX_ U32 c); +VIRTUAL bool Perl_is_uni_graph(pTHX_ U32 c); VIRTUAL bool Perl_is_uni_digit(pTHX_ U32 c); VIRTUAL bool Perl_is_uni_upper(pTHX_ U32 c); VIRTUAL bool Perl_is_uni_lower(pTHX_ U32 c); VIRTUAL bool Perl_is_uni_print(pTHX_ U32 c); +VIRTUAL bool Perl_is_uni_punct(pTHX_ U32 c); +VIRTUAL bool Perl_is_uni_xdigit(pTHX_ U32 c); VIRTUAL U32 Perl_to_uni_upper(pTHX_ U32 c); VIRTUAL U32 Perl_to_uni_title(pTHX_ U32 c); VIRTUAL U32 Perl_to_uni_lower(pTHX_ U32 c); VIRTUAL bool Perl_is_uni_alnum_lc(pTHX_ U32 c); +VIRTUAL bool Perl_is_uni_alnumc_lc(pTHX_ U32 c); VIRTUAL bool Perl_is_uni_idfirst_lc(pTHX_ U32 c); VIRTUAL bool Perl_is_uni_alpha_lc(pTHX_ U32 c); +VIRTUAL bool Perl_is_uni_ascii_lc(pTHX_ U32 c); VIRTUAL bool Perl_is_uni_space_lc(pTHX_ U32 c); +VIRTUAL bool Perl_is_uni_cntrl_lc(pTHX_ U32 c); +VIRTUAL bool Perl_is_uni_graph_lc(pTHX_ U32 c); VIRTUAL bool Perl_is_uni_digit_lc(pTHX_ U32 c); VIRTUAL bool 
Perl_is_uni_upper_lc(pTHX_ U32 c); VIRTUAL bool Perl_is_uni_lower_lc(pTHX_ U32 c); VIRTUAL bool Perl_is_uni_print_lc(pTHX_ U32 c); +VIRTUAL bool Perl_is_uni_punct_lc(pTHX_ U32 c); +VIRTUAL bool Perl_is_uni_xdigit_lc(pTHX_ U32 c); VIRTUAL U32 Perl_to_uni_upper_lc(pTHX_ U32 c); VIRTUAL U32 Perl_to_uni_title_lc(pTHX_ U32 c); VIRTUAL U32 Perl_to_uni_lower_lc(pTHX_ U32 c); VIRTUAL bool Perl_is_utf8_alnum(pTHX_ U8 *p); +VIRTUAL bool Perl_is_utf8_alnumc(pTHX_ U8 *p); VIRTUAL bool Perl_is_utf8_idfirst(pTHX_ U8 *p); VIRTUAL bool Perl_is_utf8_alpha(pTHX_ U8 *p); +VIRTUAL bool Perl_is_utf8_ascii(pTHX_ U8 *p); VIRTUAL bool Perl_is_utf8_space(pTHX_ U8 *p); +VIRTUAL bool Perl_is_utf8_cntrl(pTHX_ U8 *p); VIRTUAL bool Perl_is_utf8_digit(pTHX_ U8 *p); +VIRTUAL bool Perl_is_utf8_graph(pTHX_ U8 *p); VIRTUAL bool Perl_is_utf8_upper(pTHX_ U8 *p); VIRTUAL bool Perl_is_utf8_lower(pTHX_ U8 *p); VIRTUAL bool Perl_is_utf8_print(pTHX_ U8 *p); +VIRTUAL bool Perl_is_utf8_punct(pTHX_ U8 *p); +VIRTUAL bool Perl_is_utf8_xdigit(pTHX_ U8 *p); VIRTUAL bool Perl_is_utf8_mark(pTHX_ U8 *p); VIRTUAL OP* Perl_jmaybe(pTHX_ OP* arg); VIRTUAL I32 Perl_keyword(pTHX_ char* d, I32 len); @@ -854,7 +872,8 @@ STATIC void S_scan_commit(pTHX_ scan_data_t *data); STATIC I32 S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags); STATIC I32 S_add_data(pTHX_ I32 n, char *s); STATIC void S_re_croak2(pTHX_ const char* pat1, const char* pat2, ...) 
__attribute__((noreturn)); -STATIC char* S_regpposixcc(pTHX_ I32 value); +STATIC I32 S_regpposixcc(pTHX_ I32 value); +STATIC void S_checkposixcc(pTHX); STATIC void S_clear_re(pTHX_ void *r); #endif #if defined(PERL_IN_REGEXEC_C) || defined(PERL_DECL_PROT) diff --git a/regcomp.c b/regcomp.c index 59fe5a7..3569b3b 100644 --- a/regcomp.c +++ b/regcomp.c @@ -163,6 +163,9 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, #define LOC (PL_regflags & PMf_LOCALE) #define FOLD (PL_regflags & PMf_FOLD) +#define OOB_CHAR8 1234 +#define OOB_UTF8 123456 + #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv)) #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b) @@ -2093,12 +2096,17 @@ S_regwhite(pTHX_ char *p, char *e) return p; } -/* parse POSIX character classes like [[:foo:]] */ -STATIC char* +/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]]. + Character classes ([:foo:]) can also be negated ([:^foo:]). + Returns a named class id (ANYOF_XXX) if successful, -1 otherwise. + Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed, + but trigger warnings because they are currently unimplemented. */ +STATIC I32 S_regpposixcc(pTHX_ I32 value) { dTHR; char *posixcc = 0; + I32 namedclass = -1; if (value == '[' && PL_regcomp_parse + 1 < PL_regxend && /* I smell either [: or [= or [. -- POSIX has been here, right? */ @@ -2114,26 +2122,120 @@ S_regpposixcc(pTHX_ I32 value) /* Grandfather lone [:, [=, [. */ PL_regcomp_parse = s; else { - PL_regcomp_parse++; /* skip over the c */ - if (*PL_regcomp_parse == ']') { - /* Not Implemented Yet. - * (POSIX Extended Character Classes, that is) - * The text between e.g. [: and :] would start - * at s + 1 and stop at regcomp_parse - 2. 
*/ - if (ckWARN(WARN_UNSAFE) && !SIZE_ONLY) + char* t = PL_regcomp_parse++; /* skip over the c */ + + if (*PL_regcomp_parse == ']') { + PL_regcomp_parse++; /* skip over the ending ] */ + posixcc = s + 1; + if (*s == ':') { + I32 complement = *posixcc == '^' ? *posixcc++ : 0; + I32 skip = 5; /* the most common skip */ + + switch (*posixcc) { + case 'a': + if (strnEQ(posixcc, "alnum", 5)) + namedclass = + complement ? ANYOF_NALNUMC : ANYOF_ALNUMC; + else if (strnEQ(posixcc, "alpha", 5)) + namedclass = + complement ? ANYOF_NALPHA : ANYOF_ALPHA; + else if (strnEQ(posixcc, "ascii", 5)) + namedclass = + complement ? ANYOF_NASCII : ANYOF_ASCII; + break; + case 'c': + if (strnEQ(posixcc, "cntrl", 5)) + namedclass = + complement ? ANYOF_NCNTRL : ANYOF_CNTRL; + break; + case 'd': + if (strnEQ(posixcc, "digit", 5)) + namedclass = + complement ? ANYOF_NDIGIT : ANYOF_DIGIT; + break; + case 'g': + if (strnEQ(posixcc, "graph", 5)) + namedclass = + complement ? ANYOF_NGRAPH : ANYOF_GRAPH; + break; + case 'l': + if (strnEQ(posixcc, "lower", 5)) + namedclass = + complement ? ANYOF_NLOWER : ANYOF_LOWER; + break; + case 'p': + if (strnEQ(posixcc, "print", 5)) + namedclass = + complement ? ANYOF_NPRINT : ANYOF_PRINT; + else if (strnEQ(posixcc, "punct", 5)) + namedclass = + complement ? ANYOF_NPUNCT : ANYOF_PUNCT; + break; + case 's': + if (strnEQ(posixcc, "space", 5)) + namedclass = + complement ? ANYOF_NSPACE : ANYOF_SPACE; + case 'u': + if (strnEQ(posixcc, "upper", 5)) + namedclass = + complement ? ANYOF_NUPPER : ANYOF_UPPER; + break; + case 'w': /* this is not POSIX, this is the Perl \w */ + if (strnEQ(posixcc, "word", 4)) { + namedclass = + complement ? ANYOF_NALNUM : ANYOF_ALNUM; + skip = 4; + } + break; + case 'x': + if (strnEQ(posixcc, "xdigit", 6)) { + namedclass = + complement ? 
ANYOF_NXDIGIT : ANYOF_XDIGIT; + skip = 6; + } + break; + } + if ((namedclass == -1 || + !(posixcc + skip + 2 < PL_regxend && + (posixcc[skip] == ':' && + posixcc[skip + 1] == ']')))) + Perl_croak(aTHX_ "Character class [:%.*s:] unknown", + t - s - 1, s + 1); + } else if (ckWARN(WARN_UNSAFE) && !SIZE_ONLY) + /* [[=foo=]] and [[.foo.]] are still future. */ Perl_warner(aTHX_ WARN_UNSAFE, - "Character class syntax [%c %c] is reserved for future extensions", c, c); - PL_regcomp_parse++; /* skip over the ending ] */ - posixcc = s + 1; - } - else { - /* maternal grandfather */ + "Character class syntax [%c %c] is reserved for future extensions", c, c); + } else { + /* Maternal grandfather: + * "[:" ending in ":" but not in ":]" */ PL_regcomp_parse = s; } } } - return posixcc; + return namedclass; +} + +STATIC void +S_checkposixcc(pTHX) +{ + if (ckWARN(WARN_UNSAFE) && !SIZE_ONLY && + (*PL_regcomp_parse == ':' || + *PL_regcomp_parse == '=' || + *PL_regcomp_parse == '.')) { + char *s = PL_regcomp_parse; + char c = *s++; + + while(*s && isALNUM(*s)) + s++; + if (*s && c == *s && s[1] == ']') { + Perl_warner(aTHX_ WARN_UNSAFE, + "Character class syntax [%c %c] belongs inside character classes", c, c); + if (c == '=' || c == '.') + Perl_warner(aTHX_ WARN_UNSAFE, + "Character class syntax [%c %c] is reserved for future extensions", c, c); + } + } } STATIC regnode * @@ -2142,142 +2244,319 @@ S_regclass(pTHX) dTHR; register char *opnd, *s; register I32 value; - register I32 lastvalue = 1234; + register I32 lastvalue = OOB_CHAR8; register I32 range = 0; register regnode *ret; register I32 def; I32 numlen; + I32 namedclass; s = opnd = (char *) OPERAND(PL_regcode); ret = reg_node(ANYOF); - for (value = 0; value < 33; value++) + for (value = 0; value < ANYOF_SIZE; value++) regc(0, s++); if (*PL_regcomp_parse == '^') { /* Complement of range. 
*/ PL_regnaughty++; PL_regcomp_parse++; if (!SIZE_ONLY) - *opnd |= ANYOF_INVERT; + ANYOF_FLAGS(opnd) |= ANYOF_INVERT; } if (!SIZE_ONLY) { PL_regcode += ANY_SKIP; if (FOLD) - *opnd |= ANYOF_FOLD; + ANYOF_FLAGS(opnd) |= ANYOF_FOLD; if (LOC) - *opnd |= ANYOF_LOCALE; + ANYOF_FLAGS(opnd) |= ANYOF_LOCALE; } else { PL_regsize += ANY_SKIP; } + + checkposixcc(); + if (*PL_regcomp_parse == ']' || *PL_regcomp_parse == '-') goto skipcond; /* allow 1st char to be ] or - */ while (PL_regcomp_parse < PL_regxend && *PL_regcomp_parse != ']') { skipcond: + namedclass = -1; value = UCHARAT(PL_regcomp_parse++); if (value == '[') - (void)regpposixcc(value); /* ignore the return value for now */ + namedclass = regpposixcc(value); else if (value == '\\') { value = UCHARAT(PL_regcomp_parse++); switch (value) { - case 'w': - if (!SIZE_ONLY) { - if (LOC) - *opnd |= ANYOF_ALNUML; - else { - for (value = 0; value < 256; value++) - if (isALNUM(value)) - ANYOF_SET(opnd, value); - } + case 'w': namedclass = ANYOF_ALNUM; break; + case 'W': namedclass = ANYOF_NALNUM; break; + case 's': namedclass = ANYOF_SPACE; break; + case 'S': namedclass = ANYOF_NSPACE; break; + case 'd': namedclass = ANYOF_DIGIT; break; + case 'D': namedclass = ANYOF_NDIGIT; break; + case 'n': value = '\n'; break; + case 'r': value = '\r'; break; + case 't': value = '\t'; break; + case 'f': value = '\f'; break; + case 'b': value = '\b'; break; + case 'e': value = '\033'; break; + case 'a': value = '\007'; break; + case 'x': + value = scan_hex(PL_regcomp_parse, 2, &numlen); + PL_regcomp_parse += numlen; + break; + case 'c': + value = UCHARAT(PL_regcomp_parse++); + value = toCTRL(value); + break; + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + value = scan_oct(--PL_regcomp_parse, 3, &numlen); + PL_regcomp_parse += numlen; + break; + } + } + if (!SIZE_ONLY && namedclass > -1) { + switch (namedclass) { + case ANYOF_ALNUM: + if (LOC) + ANYOF_CLASS_SET(opnd, ANYOF_ALNUM); + 
else { + for (value = 0; value < 256; value++) + if (isALNUM(value)) + ANYOF_BITMAP_SET(opnd, value); } - lastvalue = 1234; - continue; - case 'W': - if (!SIZE_ONLY) { - if (LOC) - *opnd |= ANYOF_NALNUML; - else { - for (value = 0; value < 256; value++) - if (!isALNUM(value)) - ANYOF_SET(opnd, value); - } + break; + case ANYOF_NALNUM: + if (LOC) + ANYOF_CLASS_SET(opnd, ANYOF_NALNUM); + else { + for (value = 0; value < 256; value++) + if (!isALNUM(value)) + ANYOF_BITMAP_SET(opnd, value); } - lastvalue = 1234; - continue; - case 's': - if (!SIZE_ONLY) { - if (LOC) - *opnd |= ANYOF_SPACEL; - else { - for (value = 0; value < 256; value++) - if (isSPACE(value)) - ANYOF_SET(opnd, value); - } + break; + case ANYOF_SPACE: + if (LOC) + ANYOF_CLASS_SET(opnd, ANYOF_SPACE); + else { + for (value = 0; value < 256; value++) + if (isSPACE(value)) + ANYOF_BITMAP_SET(opnd, value); } - lastvalue = 1234; - continue; - case 'S': - if (!SIZE_ONLY) { - if (LOC) - *opnd |= ANYOF_NSPACEL; - else { - for (value = 0; value < 256; value++) - if (!isSPACE(value)) - ANYOF_SET(opnd, value); - } + break; + case ANYOF_NSPACE: + if (LOC) + ANYOF_CLASS_SET(opnd, ANYOF_NSPACE); + else { + for (value = 0; value < 256; value++) + if (!isSPACE(value)) + ANYOF_BITMAP_SET(opnd, value); } - lastvalue = 1234; - continue; - case 'd': - if (!SIZE_ONLY) { + break; + case ANYOF_DIGIT: + if (LOC) + ANYOF_CLASS_SET(opnd, ANYOF_DIGIT); + else { for (value = '0'; value <= '9'; value++) - ANYOF_SET(opnd, value); + ANYOF_BITMAP_SET(opnd, value); } - lastvalue = 1234; - continue; - case 'D': - if (!SIZE_ONLY) { + break; + case ANYOF_NDIGIT: + if (LOC) + ANYOF_CLASS_SET(opnd, ANYOF_NDIGIT); + else { for (value = 0; value < '0'; value++) - ANYOF_SET(opnd, value); + ANYOF_BITMAP_SET(opnd, value); for (value = '9' + 1; value < 256; value++) - ANYOF_SET(opnd, value); + ANYOF_BITMAP_SET(opnd, value); + } + break; + case ANYOF_NALNUMC: + if (LOC) + ANYOF_CLASS_SET(opnd, ANYOF_NALNUMC); + else { + for (value = 0; value < 
256; value++) + if (!isALNUMC(value)) + ANYOF_BITMAP_SET(opnd, value); + } + break; + case ANYOF_ALNUMC: + if (LOC) + ANYOF_CLASS_SET(opnd, ANYOF_ALNUMC); + else { + for (value = 0; value < 256; value++) + if (isALNUMC(value)) + ANYOF_BITMAP_SET(opnd, value); + } + break; + case ANYOF_ALPHA: + if (LOC) + ANYOF_CLASS_SET(opnd, ANYOF_ALPHA); + else { + for (value = 0; value < 256; value++) + if (isALPHA(value)) + ANYOF_BITMAP_SET(opnd, value); } - lastvalue = 1234; - continue; - case 'n': - value = '\n'; break; - case 'r': - value = '\r'; + case ANYOF_NALPHA: + if (LOC) + ANYOF_CLASS_SET(opnd, ANYOF_NALPHA); + else { + for (value = 0; value < 256; value++) + if (!isALPHA(value)) + ANYOF_BITMAP_SET(opnd, value); + } break; - case 't': - value = '\t'; + case ANYOF_ASCII: + if (LOC) + ANYOF_CLASS_SET(opnd, ANYOF_ASCII); + else { + for (value = 0; value < 128; value++) + ANYOF_BITMAP_SET(opnd, value); + } break; - case 'f': - value = '\f'; + case ANYOF_NASCII: + if (LOC) + ANYOF_CLASS_SET(opnd, ANYOF_NASCII); + else { + for (value = 128; value < 256; value++) + ANYOF_BITMAP_SET(opnd, value); + } break; - case 'b': - value = '\b'; + case ANYOF_CNTRL: + if (LOC) + ANYOF_CLASS_SET(opnd, ANYOF_CNTRL); + else { + for (value = 0; value < 256; value++) + if (isCNTRL(value)) + ANYOF_BITMAP_SET(opnd, value); + } + lastvalue = OOB_CHAR8; break; - case 'e': - value = '\033'; + case ANYOF_NCNTRL: + if (LOC) + ANYOF_CLASS_SET(opnd, ANYOF_NCNTRL); + else { + for (value = 0; value < 256; value++) + if (!isCNTRL(value)) + ANYOF_BITMAP_SET(opnd, value); + } break; - case 'a': - value = '\007'; + case ANYOF_GRAPH: + if (LOC) + ANYOF_CLASS_SET(opnd, ANYOF_GRAPH); + else { + for (value = 0; value < 256; value++) + if (isGRAPH(value)) + ANYOF_BITMAP_SET(opnd, value); + } break; - case 'x': - value = scan_hex(PL_regcomp_parse, 2, &numlen); - PL_regcomp_parse += numlen; + case ANYOF_NGRAPH: + if (LOC) + ANYOF_CLASS_SET(opnd, ANYOF_NGRAPH); + else { + for (value = 0; value < 256; value++) + if 
(!isGRAPH(value)) + ANYOF_BITMAP_SET(opnd, value); + } break; - case 'c': - value = UCHARAT(PL_regcomp_parse++); - value = toCTRL(value); + case ANYOF_LOWER: + if (LOC) + ANYOF_CLASS_SET(opnd, ANYOF_LOWER); + else { + for (value = 0; value < 256; value++) + if (isLOWER(value)) + ANYOF_BITMAP_SET(opnd, value); + } break; - case '0': case '1': case '2': case '3': case '4': - case '5': case '6': case '7': case '8': case '9': - value = scan_oct(--PL_regcomp_parse, 3, &numlen); - PL_regcomp_parse += numlen; + case ANYOF_NLOWER: + if (LOC) + ANYOF_CLASS_SET(opnd, ANYOF_NLOWER); + else { + for (value = 0; value < 256; value++) + if (!isLOWER(value)) + ANYOF_BITMAP_SET(opnd, value); + } + break; + case ANYOF_PRINT: + if (LOC) + ANYOF_CLASS_SET(opnd, ANYOF_PRINT); + else { + for (value = 0; value < 256; value++) + if (isPRINT(value)) + ANYOF_BITMAP_SET(opnd, value); + } + break; + case ANYOF_NPRINT: + if (LOC) + ANYOF_CLASS_SET(opnd, ANYOF_NPRINT); + else { + for (value = 0; value < 256; value++) + if (!isPRINT(value)) + ANYOF_BITMAP_SET(opnd, value); + } + break; + case ANYOF_PUNCT: + if (LOC) + ANYOF_CLASS_SET(opnd, ANYOF_PUNCT); + else { + for (value = 0; value < 256; value++) + if (isPUNCT(value)) + ANYOF_BITMAP_SET(opnd, value); + } + break; + case ANYOF_NPUNCT: + if (LOC) + ANYOF_CLASS_SET(opnd, ANYOF_NPUNCT); + else { + for (value = 0; value < 256; value++) + if (!isPUNCT(value)) + ANYOF_BITMAP_SET(opnd, value); + } + break; + case ANYOF_UPPER: + if (LOC) + ANYOF_CLASS_SET(opnd, ANYOF_UPPER); + else { + for (value = 0; value < 256; value++) + if (isUPPER(value)) + ANYOF_BITMAP_SET(opnd, value); + } + break; + case ANYOF_NUPPER: + if (LOC) + ANYOF_CLASS_SET(opnd, ANYOF_NUPPER); + else { + for (value = 0; value < 256; value++) + if (!isUPPER(value)) + ANYOF_BITMAP_SET(opnd, value); + } + break; + case ANYOF_XDIGIT: + if (LOC) + ANYOF_CLASS_SET(opnd, ANYOF_XDIGIT); + else { + for (value = 0; value < 256; value++) + if (isXDIGIT(value)) + ANYOF_BITMAP_SET(opnd, value); + 
} + break; + case ANYOF_NXDIGIT: + if (LOC) + ANYOF_CLASS_SET(opnd, ANYOF_NXDIGIT); + else { + for (value = 0; value < 256; value++) + if (!isXDIGIT(value)) + ANYOF_BITMAP_SET(opnd, value); + } + break; + default: + FAIL("invalid [::] class in regexp"); break; } + if (LOC) + ANYOF_FLAGS(opnd) |= ANYOF_CLASS; + lastvalue = OOB_CHAR8; } + else if (range) { if (lastvalue > value) FAIL("invalid [] range in regexp"); @@ -2301,35 +2580,36 @@ S_regclass(pTHX) if (isLOWER(lastvalue)) { for (i = lastvalue; i <= value; i++) if (isLOWER(i)) - ANYOF_SET(opnd, i); + ANYOF_BITMAP_SET(opnd, i); } else { for (i = lastvalue; i <= value; i++) if (isUPPER(i)) - ANYOF_SET(opnd, i); + ANYOF_BITMAP_SET(opnd, i); } } else #endif for ( ; lastvalue <= value; lastvalue++) - ANYOF_SET(opnd, lastvalue); + ANYOF_BITMAP_SET(opnd, lastvalue); } lastvalue = value; } /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */ - if (!SIZE_ONLY && (*opnd & (0xFF ^ ANYOF_INVERT)) == ANYOF_FOLD) { + if (!SIZE_ONLY && + (ANYOF_FLAGS(opnd) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD) { for (value = 0; value < 256; ++value) { - if (ANYOF_TEST(opnd, value)) { + if (ANYOF_BITMAP_TEST(opnd, value)) { I32 cf = PL_fold[value]; - ANYOF_SET(opnd, cf); + ANYOF_BITMAP_SET(opnd, cf); } } - *opnd &= ~ANYOF_FOLD; + ANYOF_FLAGS(opnd) &= ~ANYOF_FOLD; } /* optimize inverted simple patterns (e.g. 
[^a-z]) */ - if (!SIZE_ONLY && (*opnd & 0xFF) == ANYOF_INVERT) { - for (value = 0; value < 32; ++value) - opnd[1 + value] ^= 0xFF; - *opnd = 0; + if (!SIZE_ONLY && (ANYOF_FLAGS(opnd) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) { + for (value = 0; value < ANYOF_BITMAP_SIZE; ++value) + opnd[ANYOF_BITMAP_OFFSET + value] ^= ANYOF_FLAGS_ALL; + ANYOF_FLAGS(opnd) = 0; } return ret; } @@ -2337,16 +2617,17 @@ S_regclass(pTHX) STATIC regnode * S_regclassutf8(pTHX) { + dTHR; register char *opnd, *e; register U32 value; - register U32 lastvalue = 123456; + register U32 lastvalue = OOB_UTF8; register I32 range = 0; register regnode *ret; I32 numlen; I32 n; SV *listsv; U8 flags = 0; - dTHR; + I32 namedclass; if (*PL_regcomp_parse == '^') { /* Complement of range. */ PL_regnaughty++; @@ -2362,75 +2643,29 @@ S_regclassutf8(pTHX) listsv = newSVpvn("# comment\n",10); } + checkposixcc(); + if (*PL_regcomp_parse == ']' || *PL_regcomp_parse == '-') goto skipcond; /* allow 1st char to be ] or - */ while (PL_regcomp_parse < PL_regxend && *PL_regcomp_parse != ']') { skipcond: + namedclass = -1; value = utf8_to_uv((U8*)PL_regcomp_parse, &numlen); PL_regcomp_parse += numlen; if (value == '[') - (void)regpposixcc(value); /* ignore the return value for now */ + namedclass = regpposixcc(value); else if (value == '\\') { value = utf8_to_uv((U8*)PL_regcomp_parse, &numlen); PL_regcomp_parse += numlen; switch (value) { - case 'w': - if (!SIZE_ONLY) { - if (LOC) - flags |= ANYOF_ALNUML; - - Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n"); - } - lastvalue = 123456; - continue; - case 'W': - if (!SIZE_ONLY) { - if (LOC) - flags |= ANYOF_NALNUML; - - Perl_sv_catpvf(aTHX_ listsv, - "-utf8::IsAlpha\n-utf8::IsDigit\n0000\t%04x\n%04x\tffff\n", - '_' - 1, - '_' + 1); - } - lastvalue = 123456; - continue; - case 's': - if (!SIZE_ONLY) { - if (LOC) - flags |= ANYOF_SPACEL; - Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n"); - if (!PL_utf8_space) - is_utf8_space((U8*)" "); - } - lastvalue = 123456; - continue; - 
case 'S': - if (!SIZE_ONLY) { - if (LOC) - flags |= ANYOF_NSPACEL; - Perl_sv_catpvf(aTHX_ listsv, - "!utf8::IsSpace\n"); - if (!PL_utf8_space) - is_utf8_space((U8*)" "); - } - lastvalue = 123456; - continue; - case 'd': - if (!SIZE_ONLY) { - Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n"); - } - lastvalue = 123456; - continue; - case 'D': - if (!SIZE_ONLY) { - Perl_sv_catpvf(aTHX_ listsv, - "!utf8::IsDigit\n"); - } - lastvalue = 123456; - continue; + case 'w': namedclass = ANYOF_ALNUM; break; + case 'W': namedclass = ANYOF_NALNUM; break; + case 's': namedclass = ANYOF_SPACE; break; + case 'S': namedclass = ANYOF_NSPACE; break; + case 'd': namedclass = ANYOF_DIGIT; break; + case 'D': namedclass = ANYOF_NDIGIT; break; case 'p': case 'P': if (*PL_regcomp_parse == '{') { @@ -2445,41 +2680,30 @@ S_regclassutf8(pTHX) } if (!SIZE_ONLY) { if (value == 'p') - Perl_sv_catpvf(aTHX_ listsv, "+utf8::%.*s\n", n, PL_regcomp_parse); + Perl_sv_catpvf(aTHX_ listsv, + "+utf8::%.*s\n", n, PL_regcomp_parse); else Perl_sv_catpvf(aTHX_ listsv, - "!utf8::%.*s\n", n, PL_regcomp_parse); + "!utf8::%.*s\n", n, PL_regcomp_parse); } PL_regcomp_parse = e + 1; - lastvalue = 123456; + lastvalue = OOB_UTF8; continue; - case 'n': - value = '\n'; - break; - case 'r': - value = '\r'; - break; - case 't': - value = '\t'; - break; - case 'f': - value = '\f'; - break; - case 'b': - value = '\b'; - break; - case 'e': - value = '\033'; - break; - case 'a': - value = '\007'; - break; + case 'n': value = '\n'; break; + case 'r': value = '\r'; break; + case 't': value = '\t'; break; + case 'f': value = '\f'; break; + case 'b': value = '\b'; break; + case 'e': value = '\033'; break; + case 'a': value = '\007'; break; case 'x': if (*PL_regcomp_parse == '{') { e = strchr(PL_regcomp_parse++, '}'); if (!e) FAIL("Missing right brace on \\x{}"); - value = scan_hex(PL_regcomp_parse, e - PL_regcomp_parse, &numlen); + value = scan_hex(PL_regcomp_parse, + e - PL_regcomp_parse, + &numlen); PL_regcomp_parse = e + 1; } 
else { @@ -2498,7 +2722,64 @@ S_regclassutf8(pTHX) break; } } - if (range) { + if (!SIZE_ONLY && namedclass > -1) { + switch (namedclass) { + case ANYOF_ALNUM: + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n"); break; + case ANYOF_NALNUM: + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n"); break; + case ANYOF_ALNUMC: + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n"); break; + case ANYOF_NALNUMC: + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n"); break; + case ANYOF_ALPHA: + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n"); break; + case ANYOF_NALPHA: + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n"); break; + case ANYOF_ASCII: + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n"); break; + case ANYOF_NASCII: + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n"); break; + case ANYOF_CNTRL: + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n"); break; + case ANYOF_NCNTRL: + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n"); break; + case ANYOF_GRAPH: + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n"); break; + case ANYOF_NGRAPH: + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n"); break; + case ANYOF_DIGIT: + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n"); break; + case ANYOF_NDIGIT: + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n"); break; + case ANYOF_LOWER: + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n"); break; + case ANYOF_NLOWER: + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n"); break; + case ANYOF_PRINT: + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n"); break; + case ANYOF_NPRINT: + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n"); break; + case ANYOF_PUNCT: + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n"); break; + case ANYOF_NPUNCT: + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n"); break; + case ANYOF_SPACE: + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n"); break; + case ANYOF_NSPACE: + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n"); break; + case ANYOF_UPPER: + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n"); break; + case ANYOF_NUPPER: + 
Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n"); break; + case ANYOF_XDIGIT: + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n"); break; + case ANYOF_NXDIGIT: + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n"); break; + } + } + else + if (range) { if (lastvalue > value) FAIL("invalid [] range in regexp"); if (!SIZE_ONLY) diff --git a/regcomp.h b/regcomp.h index 518add0..c679ca4 100644 --- a/regcomp.h +++ b/regcomp.h @@ -154,24 +154,76 @@ struct regnode_2 { #define SIZE_ONLY (PL_regcode == &PL_regdummy) -/* Flags for first parameter byte of ANYOF */ -#define ANYOF_INVERT 0x40 -#define ANYOF_FOLD 0x20 -#define ANYOF_LOCALE 0x10 -#define ANYOF_ISA 0x0F -#define ANYOF_ALNUML 0x08 -#define ANYOF_NALNUML 0x04 -#define ANYOF_SPACEL 0x02 -#define ANYOF_NSPACEL 0x01 - -/* Utility macros for bitmap of ANYOF */ -#define ANYOF_BYTE(p,c) (p)[1 + (((c) >> 3) & 31)] -#define ANYOF_BIT(c) (1 << ((c) & 7)) -#define ANYOF_SET(p,c) (ANYOF_BYTE(p,c) |= ANYOF_BIT(c)) -#define ANYOF_CLEAR(p,c) (ANYOF_BYTE(p,c) &= ~ANYOF_BIT(c)) -#define ANYOF_TEST(p,c) (ANYOF_BYTE(p,c) & ANYOF_BIT(c)) - -#define ANY_SKIP ((33 - 1)/sizeof(regnode) + 1) +/* Flags for first parameter byte [0] of ANYOF */ + +#define ANYOF_CLASS 0x08 +#define ANYOF_INVERT 0x04 +#define ANYOF_FOLD 0x02 +#define ANYOF_LOCALE 0x01 + +/* Character classes for bytes [1..4] of ANYOF */ + +#define ANYOF_ALNUM 0 /* \w, utf8::IsWord, isALNUM() */ +#define ANYOF_NALNUM 1 +#define ANYOF_SPACE 2 +#define ANYOF_NSPACE 3 +#define ANYOF_DIGIT 4 +#define ANYOF_NDIGIT 5 +#define ANYOF_ALNUMC 6 /* isalnum(3), utf8::IsAlnum, isALNUMC() */ +#define ANYOF_NALNUMC 7 +#define ANYOF_ALPHA 8 +#define ANYOF_NALPHA 9 +#define ANYOF_ASCII 10 +#define ANYOF_NASCII 11 +#define ANYOF_CNTRL 12 +#define ANYOF_NCNTRL 13 +#define ANYOF_GRAPH 14 +#define ANYOF_NGRAPH 15 +#define ANYOF_LOWER 16 +#define ANYOF_NLOWER 17 +#define ANYOF_PRINT 18 +#define ANYOF_NPRINT 19 +#define ANYOF_PUNCT 20 +#define ANYOF_NPUNCT 21 +#define ANYOF_UPPER 22 +#define 
ANYOF_NUPPER 23 +#define ANYOF_XDIGIT 24 +#define ANYOF_NXDIGIT 25 + +#define ANYOF_MAX 31 + +/* Backward source code compatibility. */ + +#define ANYOF_ALNUML ANYOF_ALNUM +#define ANYOF_NALNUML ANYOF_NALNUM +#define ANYOF_SPACEL ANYOF_SPACE +#define ANYOF_NSPACEL ANYOF_NSPACE + +/* Utility macros for the bitmap and classes of ANYOF */ + +#define ANYOF_OPND_SIZE 1 +#define ANYOF_CLASS_SIZE 4 +#define ANYOF_BITMAP_SIZE 32 /* 256 b/(8 b/B) */ +#define ANYOF_SIZE (ANYOF_OPND_SIZE+ANYOF_CLASS_SIZE+ANYOF_BITMAP_SIZE) + +#define ANYOF_FLAGS(p) ((p)[0]) +#define ANYOF_FLAGS_ALL 0xff + +#define ANYOF_BIT(c) (1 << ((c) & 7)) + +#define ANYOF_CLASS_OFFSET ANYOF_OPND_SIZE +#define ANYOF_CLASS_BYTE(p, c) ((p)[ANYOF_CLASS_OFFSET + (((c) >> 3) & 3)]) +#define ANYOF_CLASS_SET(p, c) (ANYOF_CLASS_BYTE(p, c) |= ANYOF_BIT(c)) +#define ANYOF_CLASS_CLEAR(p, c) (ANYOF_CLASS_BYTE(p, c) &= ~ANYOF_BIT(c)) +#define ANYOF_CLASS_TEST(p, c) (ANYOF_CLASS_BYTE(p, c) & ANYOF_BIT(c)) + +#define ANYOF_BITMAP_OFFSET (ANYOF_CLASS_OFFSET+ANYOF_CLASS_SIZE) +#define ANYOF_BITMAP_BYTE(p, c) ((p)[ANYOF_BITMAP_OFFSET + (((c) >> 3) & 31)]) +#define ANYOF_BITMAP_SET(p, c) (ANYOF_BITMAP_BYTE(p, c) |= ANYOF_BIT(c)) +#define ANYOF_BITMAP_CLEAR(p,c) (ANYOF_BITMAP_BYTE(p, c) &= ~ANYOF_BIT(c)) +#define ANYOF_BITMAP_TEST(p, c) (ANYOF_BITMAP_BYTE(p, c) & ANYOF_BIT(c)) + +#define ANY_SKIP ((ANYOF_SIZE - 1)/sizeof(regnode) + 1) /* * Utility definitions. 
diff --git a/regcomp.sym b/regcomp.sym index 1391dfb..4e5c1c1 100644 --- a/regcomp.sym +++ b/regcomp.sym @@ -50,8 +50,80 @@ NSPACEL NSPACE, no Match any non-whitespace char in locale NSPACELUTF8 NSPACE, no Match any non-whitespace char in locale DIGIT DIGIT, no Match any numeric character DIGITUTF8 DIGIT, no Match any numeric character +DIGITL DIGIT, no Match any numeric character in locale +DIGITLUTF8 DIGIT, no Match any numeric character in locale NDIGIT NDIGIT, no Match any non-numeric character NDIGITUTF8 NDIGIT, no Match any non-numeric character +NDIGITL NDIGIT, no Match any non-numeric character in locale +NDIGITLUTF8 NDIGIT, no Match any non-numeric character in locale +ALNUMC ALNUMC, no Match any alphanumeric character +ALNUMCUTF8 ALNUMC, no Match any alphanumeric character +ALNUMCL ALNUMC, no Match any alphanumeric character in locale +ALNUMCLUTF8 ALNUMC, no Match any alphanumeric character in locale +NALNUMC NALNUMC, no Match any non-alphanumeric character +NALNUMCUTF8 NALNUMC, no Match any non-alphanumeric character +NALNUMCL NALNUMC, no Match any non-alphanumeric character in locale +NALNUMCLUTF8 NALNUMC, no Match any non-alphanumeric character in locale +ALPHA ALPHA, no Match any alphabetic character +ALPHAUTF8 ALPHA, no Match any alphabetic character +ALPHAL ALPHA, no Match any alphabetic character in locale +ALPHALUTF8 ALPHA, no Match any alphabetic character in locale +NALPHA NALPHA, no Match any non-alphabetic character +NALPHAUTF8 NALPHA, no Match any non-alphabetic character +NALPHAL NALPHA, no Match any non-alphabetic character in locale +NALPHALUTF8 NALPHA, no Match any non-alphabetic character in locale +ASCII ASCII, no Match any ASCII character +NASCII NASCII, no Match any non-ASCII character +CNTRL CNTRL, no Match any control character +CNTRLUTF8 CNTRL, no Match any control character +CNTRLL CNTRL, no Match any control character in locale +CNTRLLUTF8 CNTRL, no Match any control character in locale +NCNTRL NCNTRL, no Match any non-control 
character +NCNTRLUTF8 NCNTRL, no Match any non-control character +NCNTRLL NCNTRL, no Match any non-control character in locale +NCNTRLLUTF8 NCNTRL, no Match any non-control character in locale +GRAPH GRAPH, no Match any graphical character +GRAPHUTF8 GRAPH, no Match any graphical character +GRAPHL GRAPH, no Match any graphical character in locale +GRAPHLUTF8 GRAPH, no Match any graphical character in locale +NGRAPH NGRAPH, no Match any non-graphical character +NGRAPHUTF8 NGRAPH, no Match any non-graphical character +NGRAPHL NGRAPH, no Match any non-graphical character in locale +NGRAPHLUTF8 NGRAPH, no Match any non-graphical character in locale +LOWER LOWER, no Match any lowercase character +LOWERUTF8 LOWER, no Match any lowercase character +LOWERL LOWER, no Match any lowercase character in locale +LOWERLUTF8 LOWER, no Match any lowercase character in locale +NLOWER NLOWER, no Match any non-lowercase character +NLOWERUTF8 NLOWER, no Match any non-lowercase character +NLOWERL NLOWER, no Match any non-lowercase character in locale +NLOWERLUTF8 NLOWER, no Match any non-lowercase character in locale +PRINT PRINT, no Match any printable character +PRINTUTF8 PRINT, no Match any printable character +PRINTL PRINT, no Match any printable character in locale +PRINTLUTF8 PRINT, no Match any printable character in locale +NPRINT NPRINT, no Match any non-printable character +NPRINTUTF8 NPRINT, no Match any non-printable character +NPRINTL NPRINT, no Match any non-printable character in locale +NPRINTLUTF8 NPRINT, no Match any non-printable character in locale +PUNCT PUNCT, no Match any punctuation character +PUNCTUTF8 PUNCT, no Match any punctuation character +PUNCTL PUNCT, no Match any punctuation character in locale +PUNCTLUTF8 PUNCT, no Match any punctuation character in locale +NPUNCT NPUNCT, no Match any non-punctuation character +NPUNCTUTF8 NPUNCT, no Match any non-punctuation character +NPUNCTL NPUNCT, no Match any non-punctuation character in locale +NPUNCTLUTF8 NPUNCT, 
no Match any non-punctuation character in locale +UPPER UPPER, no Match any uppercase character +UPPERUTF8 UPPER, no Match any uppercase character +UPPERL UPPER, no Match any uppercase character in locale +UPPERLUTF8 UPPER, no Match any uppercase character in locale +NUPPER NUPPER, no Match any non-uppercase character +NUPPERUTF8 NUPPER, no Match any non-uppercase character +NUPPERL NUPPER, no Match any non-uppercase character in locale +NUPPERLUTF8 NUPPER, no Match any non-uppercase character in locale +XDIGIT XDIGIT, no Match any hexdigit character +NXDIGIT NXDIGIT, no Match any non-hexdigit character CLUMP CLUMP, no Match any combining character sequence # BRANCH The set of branches constituting a single choice are hooked diff --git a/regexec.c b/regexec.c index c97f89e..75f3873 100644 --- a/regexec.c +++ b/regexec.c @@ -97,7 +97,7 @@ * Forwards. */ -#define REGINCLASS(p,c) (*(p) ? reginclass(p,c) : ANYOF_TEST(p,c)) +#define REGINCLASS(p,c) (ANYOF_FLAGS(p) ? reginclass(p,c) : ANYOF_BITMAP_TEST(p,c)) #define REGINCLASSUTF8(f,p) (ARG1(f) ? reginclassutf8(f,p) : swash_fetch((SV*)PL_regdata->data[ARG2(f)],p)) #define CHR_SVLEN(sv) (UTF ? 
sv_len_utf8(sv) : SvCUR(sv)) @@ -1062,6 +1062,34 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * s += UTF8SKIP(s); } break; + case DIGITL: + PL_reg_flags |= RF_tainted; + while (s < strend) { + if (isDIGIT_LC(*s)) { + if (tmp && regtry(prog, s)) + goto got_it; + else + tmp = doevery; + } + else + tmp = 1; + s++; + } + break; + case DIGITLUTF8: + PL_reg_flags |= RF_tainted; + while (s < strend) { + if (isDIGIT_LC_utf8((U8*)s)) { + if (tmp && regtry(prog, s)) + goto got_it; + else + tmp = doevery; + } + else + tmp = 1; + s += UTF8SKIP(s); + } + break; case NDIGIT: while (s < strend) { if (!isDIGIT(*s)) { @@ -1088,197 +1116,1033 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * s += UTF8SKIP(s); } break; - } - } - else { - dontbother = 0; - if (prog->float_substr != Nullsv) { /* Trim the end. */ - char *last; - I32 oldpos = scream_pos; - - if (flags & REXEC_SCREAM) { - last = screaminstr(sv, prog->float_substr, s - strbeg, - end_shift, &scream_pos, 1); /* last one */ - if (!last) - last = scream_olds; /* Only one occurence. */ + case NDIGITL: + PL_reg_flags |= RF_tainted; + while (s < strend) { + if (!isDIGIT_LC(*s)) { + if (tmp && regtry(prog, s)) + goto got_it; + else + tmp = doevery; + } + else + tmp = 1; + s++; } - else { - STRLEN len; - char *little = SvPV(prog->float_substr, len); - - if (SvTAIL(prog->float_substr)) { - if (memEQ(strend - len + 1, little, len - 1)) - last = strend - len + 1; - else if (!PL_multiline) - last = memEQ(strend - len, little, len) - ? 
strend - len : Nullch; + break; + case NDIGITLUTF8: + PL_reg_flags |= RF_tainted; + while (s < strend) { + if (!isDIGIT_LC_utf8((U8*)s)) { + if (tmp && regtry(prog, s)) + goto got_it; else - goto find_last; - } else { - find_last: - if (len) - last = rninstr(s, strend, little, little + len); + tmp = doevery; + } + else + tmp = 1; + s += UTF8SKIP(s); + } + break; + case ALNUMC: + while (s < strend) { + if (isALNUMC(*s)) { + if (tmp && regtry(prog, s)) + goto got_it; else - last = strend; /* matching `$' */ + tmp = doevery; } + else + tmp = 1; + s++; } - if (last == NULL) goto phooey; /* Should not happen! */ - dontbother = strend - last + prog->float_min_offset; - } - if (minlen && (dontbother < minlen)) - dontbother = minlen - 1; - strend -= dontbother; /* this one's always in bytes! */ - /* We don't know much -- general case. */ - if (UTF) { - for (;;) { - if (regtry(prog, s)) - goto got_it; - if (s >= strend) - break; + break; + case ALNUMCUTF8: + while (s < strend) { + if (swash_fetch(PL_utf8_alnumc, (U8*)s)) { + if (tmp && regtry(prog, s)) + goto got_it; + else + tmp = doevery; + } + else + tmp = 1; s += UTF8SKIP(s); - }; - } - else { - do { - if (regtry(prog, s)) - goto got_it; - } while (s++ < strend); - } - } - - /* Failure. */ - goto phooey; - -got_it: - RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted); - - if (PL_reg_eval_set) { - /* Preserve the current value of $^R */ - if (oreplsv != GvSV(PL_replgv)) - sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is - restored, the value remains - the same. 
*/ - restore_pos(0); - } - - /* make sure $`, $&, $', and $digit will work later */ - if ( !(flags & REXEC_NOT_FIRST) ) { - if (RX_MATCH_COPIED(prog)) { - Safefree(prog->subbeg); - RX_MATCH_COPIED_off(prog); - } - if (flags & REXEC_COPY_STR) { - I32 i = PL_regeol - startpos + (stringarg - strbeg); - - s = savepvn(strbeg, i); - prog->subbeg = s; - prog->sublen = i; - RX_MATCH_COPIED_on(prog); - } - else { - prog->subbeg = strbeg; - prog->sublen = PL_regeol - strbeg; /* strend may have been modified */ - } - } - - return 1; - -phooey: - if (PL_reg_eval_set) - restore_pos(0); - return 0; -} - -/* - - regtry - try match at specific point - */ -STATIC I32 /* 0 failure, 1 success */ -S_regtry(pTHX_ regexp *prog, char *startpos) -{ - dTHR; - register I32 i; - register I32 *sp; - register I32 *ep; - CHECKPOINT lastcp; - - if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) { - MAGIC *mg; - - PL_reg_eval_set = RS_init; - DEBUG_r(DEBUG_s( - PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %i\n", - PL_stack_sp - PL_stack_base); - )); - SAVEINT(cxstack[cxstack_ix].blk_oldsp); - cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base; - /* Otherwise OP_NEXTSTATE will free whatever on stack now. */ - SAVETMPS; - /* Apparently this is not needed, judging by wantarray. */ - /* SAVEINT(cxstack[cxstack_ix].blk_gimme); - cxstack[cxstack_ix].blk_gimme = G_SCALAR; */ - - if (PL_reg_sv) { - /* Make $_ available to executed code. 
*/ - if (PL_reg_sv != DEFSV) { - /* SAVE_DEFSV does *not* suffice here for USE_THREADS */ - SAVESPTR(DEFSV); - DEFSV = PL_reg_sv; } - - if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv) - && (mg = mg_find(PL_reg_sv, 'g')))) { - /* prepare for quick setting of pos */ - sv_magic(PL_reg_sv, (SV*)0, 'g', Nullch, 0); - mg = mg_find(PL_reg_sv, 'g'); - mg->mg_len = -1; + break; + case ALNUMCL: + PL_reg_flags |= RF_tainted; + while (s < strend) { + if (isALNUMC_LC(*s)) { + if (tmp && regtry(prog, s)) + goto got_it; + else + tmp = doevery; + } + else + tmp = 1; + s++; } - PL_reg_magic = mg; - PL_reg_oldpos = mg->mg_len; - SAVEDESTRUCTOR(S_restore_pos, 0); - } - if (!PL_reg_curpm) - New(22,PL_reg_curpm, 1, PMOP); - PL_reg_curpm->op_pmregexp = prog; - PL_reg_oldcurpm = PL_curpm; - PL_curpm = PL_reg_curpm; - if (RX_MATCH_COPIED(prog)) { - /* Here is a serious problem: we cannot rewrite subbeg, - since it may be needed if this match fails. Thus - $` inside (?{}) could fail... */ - PL_reg_oldsaved = prog->subbeg; - PL_reg_oldsavedlen = prog->sublen; - RX_MATCH_COPIED_off(prog); - } - else - PL_reg_oldsaved = Nullch; - prog->subbeg = PL_bostr; - prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */ - } - prog->startp[0] = startpos - PL_bostr; - PL_reginput = startpos; - PL_regstartp = prog->startp; - PL_regendp = prog->endp; - PL_reglastparen = &prog->lastparen; - prog->lastparen = 0; - PL_regsize = 0; - DEBUG_r(PL_reg_starttry = startpos); - if (PL_reg_start_tmpl <= prog->nparens) { - PL_reg_start_tmpl = prog->nparens*3/2 + 3; - if(PL_reg_start_tmp) - Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*); - else - New(22,PL_reg_start_tmp, PL_reg_start_tmpl, char*); - } - - /* XXXX What this code is doing here?!!! There should be no need - to do this again and again, PL_reglastparen should take care of - this! 
*/ - sp = prog->startp; - ep = prog->endp; - if (prog->nparens) { - for (i = prog->nparens; i >= 1; i--) { - *++sp = -1; - *++ep = -1; + break; + case ALNUMCLUTF8: + PL_reg_flags |= RF_tainted; + while (s < strend) { + if (isALNUMC_LC_utf8((U8*)s)) { + if (tmp && regtry(prog, s)) + goto got_it; + else + tmp = doevery; + } + else + tmp = 1; + s += UTF8SKIP(s); + } + break; + case NALNUMC: + while (s < strend) { + if (!isALNUMC(*s)) { + if (tmp && regtry(prog, s)) + goto got_it; + else + tmp = doevery; + } + else + tmp = 1; + s++; + } + break; + case NALNUMCUTF8: + while (s < strend) { + if (!swash_fetch(PL_utf8_alnumc, (U8*)s)) { + if (tmp && regtry(prog, s)) + goto got_it; + else + tmp = doevery; + } + else + tmp = 1; + s += UTF8SKIP(s); + } + break; + case NALNUMCL: + PL_reg_flags |= RF_tainted; + while (s < strend) { + if (!isALNUMC_LC(*s)) { + if (tmp && regtry(prog, s)) + goto got_it; + else + tmp = doevery; + } + else + tmp = 1; + s++; + } + break; + case NALNUMCLUTF8: + PL_reg_flags |= RF_tainted; + while (s < strend) { + if (!isALNUMC_LC_utf8((U8*)s)) { + if (tmp && regtry(prog, s)) + goto got_it; + else + tmp = doevery; + } + else + tmp = 1; + s += UTF8SKIP(s); + } + break; + case ASCII: + while (s < strend) { + if (isASCII(*s)) { + if (tmp && regtry(prog, s)) + goto got_it; + else + tmp = doevery; + } + else + tmp = 1; + s++; + } + break; + case NASCII: + while (s < strend) { + if (!isASCII(*s)) { + if (tmp && regtry(prog, s)) + goto got_it; + else + tmp = doevery; + } + else + tmp = 1; + s++; + } + break; + case CNTRL: + while (s < strend) { + if (isCNTRL(*s)) { + if (tmp && regtry(prog, s)) + goto got_it; + else + tmp = doevery; + } + else + tmp = 1; + s++; + } + break; + case CNTRLUTF8: + while (s < strend) { + if (swash_fetch(PL_utf8_cntrl,(U8*)s)) { + if (tmp && regtry(prog, s)) + goto got_it; + else + tmp = doevery; + } + else + tmp = 1; + s += UTF8SKIP(s); + } + break; + case CNTRLL: + PL_reg_flags |= RF_tainted; + while (s < strend) { + if 
(isCNTRL_LC(*s)) { + if (tmp && regtry(prog, s)) + goto got_it; + else + tmp = doevery; + } + else + tmp = 1; + s++; + } + break; + case CNTRLLUTF8: + PL_reg_flags |= RF_tainted; + while (s < strend) { + if (*s == ' ' || isCNTRL_LC_utf8((U8*)s)) { + if (tmp && regtry(prog, s)) + goto got_it; + else + tmp = doevery; + } + else + tmp = 1; + s += UTF8SKIP(s); + } + break; + case NCNTRL: + while (s < strend) { + if (!isCNTRL(*s)) { + if (tmp && regtry(prog, s)) + goto got_it; + else + tmp = doevery; + } + else + tmp = 1; + s++; + } + break; + case NCNTRLUTF8: + while (s < strend) { + if (!swash_fetch(PL_utf8_cntrl,(U8*)s)) { + if (tmp && regtry(prog, s)) + goto got_it; + else + tmp = doevery; + } + else + tmp = 1; + s += UTF8SKIP(s); + } + break; + case NCNTRLL: + PL_reg_flags |= RF_tainted; + while (s < strend) { + if (!isCNTRL_LC(*s)) { + if (tmp && regtry(prog, s)) + goto got_it; + else + tmp = doevery; + } + else + tmp = 1; + s++; + } + break; + case NCNTRLLUTF8: + PL_reg_flags |= RF_tainted; + while (s < strend) { + if (!isCNTRL_LC_utf8((U8*)s)) { + if (tmp && regtry(prog, s)) + goto got_it; + else + tmp = doevery; + } + else + tmp = 1; + s += UTF8SKIP(s); + } + break; + case GRAPH: + while (s < strend) { + if (isGRAPH(*s)) { + if (tmp && regtry(prog, s)) + goto got_it; + else + tmp = doevery; + } + else + tmp = 1; + s++; + } + break; + case GRAPHUTF8: + while (s < strend) { + if (swash_fetch(PL_utf8_graph,(U8*)s)) { + if (tmp && regtry(prog, s)) + goto got_it; + else + tmp = doevery; + } + else + tmp = 1; + s += UTF8SKIP(s); + } + break; + case GRAPHL: + PL_reg_flags |= RF_tainted; + while (s < strend) { + if (isGRAPH_LC(*s)) { + if (tmp && regtry(prog, s)) + goto got_it; + else + tmp = doevery; + } + else + tmp = 1; + s++; + } + break; + case GRAPHLUTF8: + PL_reg_flags |= RF_tainted; + while (s < strend) { + if (*s == ' ' || isGRAPH_LC_utf8((U8*)s)) { + if (tmp && regtry(prog, s)) + goto got_it; + else + tmp = doevery; + } + else + tmp = 1; + s += UTF8SKIP(s); + 
} + break; + case NGRAPH: + while (s < strend) { + if (!isGRAPH(*s)) { + if (tmp && regtry(prog, s)) + goto got_it; + else + tmp = doevery; + } + else + tmp = 1; + s++; + } + break; + case NGRAPHUTF8: + while (s < strend) { + if (!swash_fetch(PL_utf8_graph,(U8*)s)) { + if (tmp && regtry(prog, s)) + goto got_it; + else + tmp = doevery; + } + else + tmp = 1; + s += UTF8SKIP(s); + } + break; + case NGRAPHL: + PL_reg_flags |= RF_tainted; + while (s < strend) { + if (!isGRAPH_LC(*s)) { + if (tmp && regtry(prog, s)) + goto got_it; + else + tmp = doevery; + } + else + tmp = 1; + s++; + } + break; + case NGRAPHLUTF8: + PL_reg_flags |= RF_tainted; + while (s < strend) { + if (!isGRAPH_LC_utf8((U8*)s)) { + if (tmp && regtry(prog, s)) + goto got_it; + else + tmp = doevery; + } + else + tmp = 1; + s += UTF8SKIP(s); + } + break; + case LOWER: + while (s < strend) { + if (isLOWER(*s)) { + if (tmp && regtry(prog, s)) + goto got_it; + else + tmp = doevery; + } + else + tmp = 1; + s++; + } + break; + case LOWERUTF8: + while (s < strend) { + if (swash_fetch(PL_utf8_lower,(U8*)s)) { + if (tmp && regtry(prog, s)) + goto got_it; + else + tmp = doevery; + } + else + tmp = 1; + s += UTF8SKIP(s); + } + break; + case LOWERL: + PL_reg_flags |= RF_tainted; + while (s < strend) { + if (isLOWER_LC(*s)) { + if (tmp && regtry(prog, s)) + goto got_it; + else + tmp = doevery; + } + else + tmp = 1; + s++; + } + break; + case LOWERLUTF8: + PL_reg_flags |= RF_tainted; + while (s < strend) { + if (*s == ' ' || isLOWER_LC_utf8((U8*)s)) { + if (tmp && regtry(prog, s)) + goto got_it; + else + tmp = doevery; + } + else + tmp = 1; + s += UTF8SKIP(s); + } + break; + case NLOWER: + while (s < strend) { + if (!isLOWER(*s)) { + if (tmp && regtry(prog, s)) + goto got_it; + else + tmp = doevery; + } + else + tmp = 1; + s++; + } + break; + case NLOWERUTF8: + while (s < strend) { + if (!swash_fetch(PL_utf8_lower,(U8*)s)) { + if (tmp && regtry(prog, s)) + goto got_it; + else + tmp = doevery; + } + else + tmp = 1; + 
s += UTF8SKIP(s); + } + break; + case NLOWERL: + PL_reg_flags |= RF_tainted; + while (s < strend) { + if (!isLOWER_LC(*s)) { + if (tmp && regtry(prog, s)) + goto got_it; + else + tmp = doevery; + } + else + tmp = 1; + s++; + } + break; + case NLOWERLUTF8: + PL_reg_flags |= RF_tainted; + while (s < strend) { + if (!isLOWER_LC_utf8((U8*)s)) { + if (tmp && regtry(prog, s)) + goto got_it; + else + tmp = doevery; + } + else + tmp = 1; + s += UTF8SKIP(s); + } + break; + case PRINT: + while (s < strend) { + if (isPRINT(*s)) { + if (tmp && regtry(prog, s)) + goto got_it; + else + tmp = doevery; + } + else + tmp = 1; + s++; + } + break; + case PRINTUTF8: + while (s < strend) { + if (swash_fetch(PL_utf8_print,(U8*)s)) { + if (tmp && regtry(prog, s)) + goto got_it; + else + tmp = doevery; + } + else + tmp = 1; + s += UTF8SKIP(s); + } + break; + case PRINTL: + PL_reg_flags |= RF_tainted; + while (s < strend) { + if (isPRINT_LC(*s)) { + if (tmp && regtry(prog, s)) + goto got_it; + else + tmp = doevery; + } + else + tmp = 1; + s++; + } + break; + case PRINTLUTF8: + PL_reg_flags |= RF_tainted; + while (s < strend) { + if (*s == ' ' || isPRINT_LC_utf8((U8*)s)) { + if (tmp && regtry(prog, s)) + goto got_it; + else + tmp = doevery; + } + else + tmp = 1; + s += UTF8SKIP(s); + } + break; + case NPRINT: + while (s < strend) { + if (!isPRINT(*s)) { + if (tmp && regtry(prog, s)) + goto got_it; + else + tmp = doevery; + } + else + tmp = 1; + s++; + } + break; + case NPRINTUTF8: + while (s < strend) { + if (!swash_fetch(PL_utf8_print,(U8*)s)) { + if (tmp && regtry(prog, s)) + goto got_it; + else + tmp = doevery; + } + else + tmp = 1; + s += UTF8SKIP(s); + } + break; + case NPRINTL: + PL_reg_flags |= RF_tainted; + while (s < strend) { + if (!isPRINT_LC(*s)) { + if (tmp && regtry(prog, s)) + goto got_it; + else + tmp = doevery; + } + else + tmp = 1; + s++; + } + break; + case NPRINTLUTF8: + PL_reg_flags |= RF_tainted; + while (s < strend) { + if (!isPRINT_LC_utf8((U8*)s)) { + if (tmp && 
regtry(prog, s)) + goto got_it; + else + tmp = doevery; + } + else + tmp = 1; + s += UTF8SKIP(s); + } + break; + case PUNCT: + while (s < strend) { + if (isPUNCT(*s)) { + if (tmp && regtry(prog, s)) + goto got_it; + else + tmp = doevery; + } + else + tmp = 1; + s++; + } + break; + case PUNCTUTF8: + while (s < strend) { + if (swash_fetch(PL_utf8_punct,(U8*)s)) { + if (tmp && regtry(prog, s)) + goto got_it; + else + tmp = doevery; + } + else + tmp = 1; + s += UTF8SKIP(s); + } + break; + case PUNCTL: + PL_reg_flags |= RF_tainted; + while (s < strend) { + if (isPUNCT_LC(*s)) { + if (tmp && regtry(prog, s)) + goto got_it; + else + tmp = doevery; + } + else + tmp = 1; + s++; + } + break; + case PUNCTLUTF8: + PL_reg_flags |= RF_tainted; + while (s < strend) { + if (*s == ' ' || isPUNCT_LC_utf8((U8*)s)) { + if (tmp && regtry(prog, s)) + goto got_it; + else + tmp = doevery; + } + else + tmp = 1; + s += UTF8SKIP(s); + } + break; + case NPUNCT: + while (s < strend) { + if (!isPUNCT(*s)) { + if (tmp && regtry(prog, s)) + goto got_it; + else + tmp = doevery; + } + else + tmp = 1; + s++; + } + break; + case NPUNCTUTF8: + while (s < strend) { + if (!swash_fetch(PL_utf8_punct,(U8*)s)) { + if (tmp && regtry(prog, s)) + goto got_it; + else + tmp = doevery; + } + else + tmp = 1; + s += UTF8SKIP(s); + } + break; + case NPUNCTL: + PL_reg_flags |= RF_tainted; + while (s < strend) { + if (!isPUNCT_LC(*s)) { + if (tmp && regtry(prog, s)) + goto got_it; + else + tmp = doevery; + } + else + tmp = 1; + s++; + } + break; + case NPUNCTLUTF8: + PL_reg_flags |= RF_tainted; + while (s < strend) { + if (!isPUNCT_LC_utf8((U8*)s)) { + if (tmp && regtry(prog, s)) + goto got_it; + else + tmp = doevery; + } + else + tmp = 1; + s += UTF8SKIP(s); + } + break; + case UPPER: + while (s < strend) { + if (isUPPER(*s)) { + if (tmp && regtry(prog, s)) + goto got_it; + else + tmp = doevery; + } + else + tmp = 1; + s++; + } + break; + case UPPERUTF8: + while (s < strend) { + if 
(swash_fetch(PL_utf8_upper,(U8*)s)) { + if (tmp && regtry(prog, s)) + goto got_it; + else + tmp = doevery; + } + else + tmp = 1; + s += UTF8SKIP(s); + } + break; + case UPPERL: + PL_reg_flags |= RF_tainted; + while (s < strend) { + if (isUPPER_LC(*s)) { + if (tmp && regtry(prog, s)) + goto got_it; + else + tmp = doevery; + } + else + tmp = 1; + s++; + } + break; + case UPPERLUTF8: + PL_reg_flags |= RF_tainted; + while (s < strend) { + if (*s == ' ' || isUPPER_LC_utf8((U8*)s)) { + if (tmp && regtry(prog, s)) + goto got_it; + else + tmp = doevery; + } + else + tmp = 1; + s += UTF8SKIP(s); + } + break; + case NUPPER: + while (s < strend) { + if (!isUPPER(*s)) { + if (tmp && regtry(prog, s)) + goto got_it; + else + tmp = doevery; + } + else + tmp = 1; + s++; + } + break; + case NUPPERUTF8: + while (s < strend) { + if (!swash_fetch(PL_utf8_upper,(U8*)s)) { + if (tmp && regtry(prog, s)) + goto got_it; + else + tmp = doevery; + } + else + tmp = 1; + s += UTF8SKIP(s); + } + break; + case NUPPERL: + PL_reg_flags |= RF_tainted; + while (s < strend) { + if (!isUPPER_LC(*s)) { + if (tmp && regtry(prog, s)) + goto got_it; + else + tmp = doevery; + } + else + tmp = 1; + s++; + } + break; + case NUPPERLUTF8: + PL_reg_flags |= RF_tainted; + while (s < strend) { + if (!isUPPER_LC_utf8((U8*)s)) { + if (tmp && regtry(prog, s)) + goto got_it; + else + tmp = doevery; + } + else + tmp = 1; + s += UTF8SKIP(s); + } + break; + case XDIGIT: + while (s < strend) { + if (isXDIGIT(*s)) { + if (tmp && regtry(prog, s)) + goto got_it; + else + tmp = doevery; + } + else + tmp = 1; + s++; + } + break; + case NXDIGIT: + while (s < strend) { + if (!isXDIGIT(*s)) { + if (tmp && regtry(prog, s)) + goto got_it; + else + tmp = doevery; + } + else + tmp = 1; + s++; + } + break; + } + } + else { + dontbother = 0; + if (prog->float_substr != Nullsv) { /* Trim the end. 
*/ + char *last; + I32 oldpos = scream_pos; + + if (flags & REXEC_SCREAM) { + last = screaminstr(sv, prog->float_substr, s - strbeg, + end_shift, &scream_pos, 1); /* last one */ + if (!last) + last = scream_olds; /* Only one occurrence. */ + } + else { + STRLEN len; + char *little = SvPV(prog->float_substr, len); + + if (SvTAIL(prog->float_substr)) { + if (memEQ(strend - len + 1, little, len - 1)) + last = strend - len + 1; + else if (!PL_multiline) + last = memEQ(strend - len, little, len) + ? strend - len : Nullch; + else + goto find_last; + } else { + find_last: + if (len) + last = rninstr(s, strend, little, little + len); + else + last = strend; /* matching `$' */ + } + } + if (last == NULL) goto phooey; /* Should not happen! */ + dontbother = strend - last + prog->float_min_offset; + } + if (minlen && (dontbother < minlen)) + dontbother = minlen - 1; + strend -= dontbother; /* this one's always in bytes! */ + /* We don't know much -- general case. */ + if (UTF) { + for (;;) { + if (regtry(prog, s)) + goto got_it; + if (s >= strend) + break; + s += UTF8SKIP(s); + }; + } + else { + do { + if (regtry(prog, s)) + goto got_it; + } while (s++ < strend); + } + } + + /* Failure. */ + goto phooey; + +got_it: + RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted); + + if (PL_reg_eval_set) { + /* Preserve the current value of $^R */ + if (oreplsv != GvSV(PL_replgv)) + sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is + restored, the value remains + the same. 
*/ + restore_pos(0); + } + + /* make sure $`, $&, $', and $digit will work later */ + if ( !(flags & REXEC_NOT_FIRST) ) { + if (RX_MATCH_COPIED(prog)) { + Safefree(prog->subbeg); + RX_MATCH_COPIED_off(prog); + } + if (flags & REXEC_COPY_STR) { + I32 i = PL_regeol - startpos + (stringarg - strbeg); + + s = savepvn(strbeg, i); + prog->subbeg = s; + prog->sublen = i; + RX_MATCH_COPIED_on(prog); + } + else { + prog->subbeg = strbeg; + prog->sublen = PL_regeol - strbeg; /* strend may have been modified */ + } + } + + return 1; + +phooey: + if (PL_reg_eval_set) + restore_pos(0); + return 0; +} + +/* + - regtry - try match at specific point + */ +STATIC I32 /* 0 failure, 1 success */ +S_regtry(pTHX_ regexp *prog, char *startpos) +{ + dTHR; + register I32 i; + register I32 *sp; + register I32 *ep; + CHECKPOINT lastcp; + + if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) { + MAGIC *mg; + + PL_reg_eval_set = RS_init; + DEBUG_r(DEBUG_s( + PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %i\n", + PL_stack_sp - PL_stack_base); + )); + SAVEINT(cxstack[cxstack_ix].blk_oldsp); + cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base; + /* Otherwise OP_NEXTSTATE will free whatever on stack now. */ + SAVETMPS; + /* Apparently this is not needed, judging by wantarray. */ + /* SAVEINT(cxstack[cxstack_ix].blk_gimme); + cxstack[cxstack_ix].blk_gimme = G_SCALAR; */ + + if (PL_reg_sv) { + /* Make $_ available to executed code. 
*/ + if (PL_reg_sv != DEFSV) { + /* SAVE_DEFSV does *not* suffice here for USE_THREADS */ + SAVESPTR(DEFSV); + DEFSV = PL_reg_sv; + } + + if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv) + && (mg = mg_find(PL_reg_sv, 'g')))) { + /* prepare for quick setting of pos */ + sv_magic(PL_reg_sv, (SV*)0, 'g', Nullch, 0); + mg = mg_find(PL_reg_sv, 'g'); + mg->mg_len = -1; + } + PL_reg_magic = mg; + PL_reg_oldpos = mg->mg_len; + SAVEDESTRUCTOR(S_restore_pos, 0); + } + if (!PL_reg_curpm) + New(22,PL_reg_curpm, 1, PMOP); + PL_reg_curpm->op_pmregexp = prog; + PL_reg_oldcurpm = PL_curpm; + PL_curpm = PL_reg_curpm; + if (RX_MATCH_COPIED(prog)) { + /* Here is a serious problem: we cannot rewrite subbeg, + since it may be needed if this match fails. Thus + $` inside (?{}) could fail... */ + PL_reg_oldsaved = prog->subbeg; + PL_reg_oldsavedlen = prog->sublen; + RX_MATCH_COPIED_off(prog); + } + else + PL_reg_oldsaved = Nullch; + prog->subbeg = PL_bostr; + prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */ + } + prog->startp[0] = startpos - PL_bostr; + PL_reginput = startpos; + PL_regstartp = prog->startp; + PL_regendp = prog->endp; + PL_reglastparen = &prog->lastparen; + prog->lastparen = 0; + PL_regsize = 0; + DEBUG_r(PL_reg_starttry = startpos); + if (PL_reg_start_tmpl <= prog->nparens) { + PL_reg_start_tmpl = prog->nparens*3/2 + 3; + if(PL_reg_start_tmp) + Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*); + else + New(22,PL_reg_start_tmp, PL_reg_start_tmpl, char*); + } + + /* XXXX What this code is doing here?!!! There should be no need + to do this again and again, PL_reglastparen should take care of + this! */ + sp = prog->startp; + ep = prog->endp; + if (prog->nparens) { + for (i = prog->nparens; i >= 1; i--) { + *++sp = -1; + *++ep = -1; } } REGCP_SET; @@ -1390,173 +2254,723 @@ S_regmatch(pTHX_ regnode *prog) /* regtill = regbol; */ break; } - sayNO; - case MBOL: - if (locinput == PL_bostr - ? 
PL_regprev == '\n' - : ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n') ) - { + sayNO; + case MBOL: + if (locinput == PL_bostr + ? PL_regprev == '\n' + : ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n') ) + { + break; + } + sayNO; + case SBOL: + if (locinput == PL_regbol && PL_regprev == '\n') + break; + sayNO; + case GPOS: + if (locinput == PL_reg_ganch) + break; + sayNO; + case EOL: + if (PL_multiline) + goto meol; + else + goto seol; + case MEOL: + meol: + if ((nextchr || locinput < PL_regeol) && nextchr != '\n') + sayNO; + break; + case SEOL: + seol: + if ((nextchr || locinput < PL_regeol) && nextchr != '\n') + sayNO; + if (PL_regeol - locinput > 1) + sayNO; + break; + case EOS: + if (PL_regeol != locinput) + sayNO; + break; + case SANYUTF8: + if (nextchr & 0x80) { + locinput += PL_utf8skip[nextchr]; + if (locinput > PL_regeol) + sayNO; + nextchr = UCHARAT(locinput); + break; + } + if (!nextchr && locinput >= PL_regeol) + sayNO; + nextchr = UCHARAT(++locinput); + break; + case SANY: + if (!nextchr && locinput >= PL_regeol) + sayNO; + nextchr = UCHARAT(++locinput); + break; + case ANYUTF8: + if (nextchr & 0x80) { + locinput += PL_utf8skip[nextchr]; + if (locinput > PL_regeol) + sayNO; + nextchr = UCHARAT(locinput); + break; + } + if (!nextchr && locinput >= PL_regeol || nextchr == '\n') + sayNO; + nextchr = UCHARAT(++locinput); + break; + case REG_ANY: + if (!nextchr && locinput >= PL_regeol || nextchr == '\n') + sayNO; + nextchr = UCHARAT(++locinput); + break; + case EXACT: + s = (char *) OPERAND(scan); + ln = UCHARAT(s++); + /* Inline the first character, for speed. 
*/ + if (UCHARAT(s) != nextchr) + sayNO; + if (PL_regeol - locinput < ln) + sayNO; + if (ln > 1 && memNE(s, locinput, ln)) + sayNO; + locinput += ln; + nextchr = UCHARAT(locinput); + break; + case EXACTFL: + PL_reg_flags |= RF_tainted; + /* FALL THROUGH */ + case EXACTF: + s = (char *) OPERAND(scan); + ln = UCHARAT(s++); + + if (UTF) { + char *l = locinput; + char *e = s + ln; + c1 = OP(scan) == EXACTF; + while (s < e) { + if (l >= PL_regeol) + sayNO; + if (utf8_to_uv((U8*)s, 0) != (c1 ? + toLOWER_utf8((U8*)l) : + toLOWER_LC_utf8((U8*)l))) + { + sayNO; + } + s += UTF8SKIP(s); + l += UTF8SKIP(l); + } + locinput = l; + nextchr = UCHARAT(locinput); + break; + } + + /* Inline the first character, for speed. */ + if (UCHARAT(s) != nextchr && + UCHARAT(s) != ((OP(scan) == EXACTF) + ? PL_fold : PL_fold_locale)[nextchr]) + sayNO; + if (PL_regeol - locinput < ln) + sayNO; + if (ln > 1 && (OP(scan) == EXACTF + ? ibcmp(s, locinput, ln) + : ibcmp_locale(s, locinput, ln))) + sayNO; + locinput += ln; + nextchr = UCHARAT(locinput); + break; + case ANYOFUTF8: + s = (char *) OPERAND(scan); + if (!REGINCLASSUTF8(scan, (U8*)locinput)) + sayNO; + if (locinput >= PL_regeol) + sayNO; + locinput += PL_utf8skip[nextchr]; + nextchr = UCHARAT(locinput); + break; + case ANYOF: + s = (char *) OPERAND(scan); + if (nextchr < 0) + nextchr = UCHARAT(locinput); + if (!REGINCLASS(s, nextchr)) + sayNO; + if (!nextchr && locinput >= PL_regeol) + sayNO; + nextchr = UCHARAT(++locinput); + break; + case ALNUML: + PL_reg_flags |= RF_tainted; + /* FALL THROUGH */ + case ALNUM: + if (!nextchr) + sayNO; + if (!(OP(scan) == ALNUM + ? isALNUM(nextchr) : isALNUM_LC(nextchr))) + sayNO; + nextchr = UCHARAT(++locinput); + break; + case ALNUMLUTF8: + PL_reg_flags |= RF_tainted; + /* FALL THROUGH */ + case ALNUMUTF8: + if (!nextchr) + sayNO; + if (nextchr & 0x80) { + if (!(OP(scan) == ALNUMUTF8 + ? 
swash_fetch(PL_utf8_alnum, (U8*)locinput) + : isALNUM_LC_utf8((U8*)locinput))) + { + sayNO; + } + locinput += PL_utf8skip[nextchr]; + nextchr = UCHARAT(locinput); + break; + } + if (!(OP(scan) == ALNUMUTF8 + ? isALNUM(nextchr) : isALNUM_LC(nextchr))) + sayNO; + nextchr = UCHARAT(++locinput); + break; + case NALNUML: + PL_reg_flags |= RF_tainted; + /* FALL THROUGH */ + case NALNUM: + if (!nextchr && locinput >= PL_regeol) + sayNO; + if (OP(scan) == NALNUM + ? isALNUM(nextchr) : isALNUM_LC(nextchr)) + sayNO; + nextchr = UCHARAT(++locinput); + break; + case NALNUMLUTF8: + PL_reg_flags |= RF_tainted; + /* FALL THROUGH */ + case NALNUMUTF8: + if (!nextchr && locinput >= PL_regeol) + sayNO; + if (nextchr & 0x80) { + if (OP(scan) == NALNUMUTF8 + ? swash_fetch(PL_utf8_alnum, (U8*)locinput) + : isALNUM_LC_utf8((U8*)locinput)) + { + sayNO; + } + locinput += PL_utf8skip[nextchr]; + nextchr = UCHARAT(locinput); + break; + } + if (OP(scan) == NALNUMUTF8 + ? isALNUM(nextchr) : isALNUM_LC(nextchr)) + sayNO; + nextchr = UCHARAT(++locinput); + break; + case BOUNDL: + case NBOUNDL: + PL_reg_flags |= RF_tainted; + /* FALL THROUGH */ + case BOUND: + case NBOUND: + /* was last char in word? */ + ln = (locinput != PL_regbol) ? UCHARAT(locinput - 1) : PL_regprev; + if (OP(scan) == BOUND || OP(scan) == NBOUND) { + ln = isALNUM(ln); + n = isALNUM(nextchr); + } + else { + ln = isALNUM_LC(ln); + n = isALNUM_LC(nextchr); + } + if (((!ln) == (!n)) == (OP(scan) == BOUND || OP(scan) == BOUNDL)) + sayNO; + break; + case BOUNDLUTF8: + case NBOUNDLUTF8: + PL_reg_flags |= RF_tainted; + /* FALL THROUGH */ + case BOUNDUTF8: + case NBOUNDUTF8: + /* was last char in word? */ + ln = (locinput != PL_regbol) + ? 
utf8_to_uv(reghop((U8*)locinput, -1), 0) : PL_regprev; + if (OP(scan) == BOUNDUTF8 || OP(scan) == NBOUNDUTF8) { + ln = isALNUM_uni(ln); + n = swash_fetch(PL_utf8_alnum, (U8*)locinput); + } + else { + ln = isALNUM_LC_uni(ln); + n = isALNUM_LC_utf8((U8*)locinput); + } + if (((!ln) == (!n)) == (OP(scan) == BOUNDUTF8 || OP(scan) == BOUNDLUTF8)) + sayNO; + break; + case SPACEL: + PL_reg_flags |= RF_tainted; + /* FALL THROUGH */ + case SPACE: + if (!nextchr && locinput >= PL_regeol) + sayNO; + if (!(OP(scan) == SPACE + ? isSPACE(nextchr) : isSPACE_LC(nextchr))) + sayNO; + nextchr = UCHARAT(++locinput); + break; + case SPACELUTF8: + PL_reg_flags |= RF_tainted; + /* FALL THROUGH */ + case SPACEUTF8: + if (!nextchr && locinput >= PL_regeol) + sayNO; + if (nextchr & 0x80) { + if (!(OP(scan) == SPACEUTF8 + ? swash_fetch(PL_utf8_space,(U8*)locinput) + : isSPACE_LC_utf8((U8*)locinput))) + { + sayNO; + } + locinput += PL_utf8skip[nextchr]; + nextchr = UCHARAT(locinput); + break; + } + if (!(OP(scan) == SPACEUTF8 + ? isSPACE(nextchr) : isSPACE_LC(nextchr))) + sayNO; + nextchr = UCHARAT(++locinput); + break; + case NSPACEL: + PL_reg_flags |= RF_tainted; + /* FALL THROUGH */ + case NSPACE: + if (!nextchr) + sayNO; + if (OP(scan) == SPACE + ? isSPACE(nextchr) : isSPACE_LC(nextchr)) + sayNO; + nextchr = UCHARAT(++locinput); + break; + case NSPACELUTF8: + PL_reg_flags |= RF_tainted; + /* FALL THROUGH */ + case NSPACEUTF8: + if (!nextchr) + sayNO; + if (nextchr & 0x80) { + if (OP(scan) == NSPACEUTF8 + ? swash_fetch(PL_utf8_space,(U8*)locinput) + : isSPACE_LC_utf8((U8*)locinput)) + { + sayNO; + } + locinput += PL_utf8skip[nextchr]; + nextchr = UCHARAT(locinput); break; } - sayNO; - case SBOL: - if (locinput == PL_regbol && PL_regprev == '\n') + if (OP(scan) == NSPACEUTF8 + ? 
isSPACE(nextchr) : isSPACE_LC(nextchr)) + sayNO; + nextchr = UCHARAT(++locinput); + break; + case DIGITL: + PL_reg_flags |= RF_tainted; + /* FALL THROUGH */ + case DIGIT: + if (!nextchr && locinput >= PL_regeol) + sayNO; + if (!(OP(scan) == DIGIT + ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))) + sayNO; + nextchr = UCHARAT(++locinput); + break; + case DIGITLUTF8: + PL_reg_flags |= RF_tainted; + /* FALL THROUGH */ + case DIGITUTF8: + if (!nextchr) + sayNO; + if (nextchr & 0x80) { + if (OP(scan) == NDIGITUTF8 + ? swash_fetch(PL_utf8_digit,(U8*)locinput) + : isDIGIT_LC_utf8((U8*)locinput)) + { + sayNO; + } + locinput += PL_utf8skip[nextchr]; + nextchr = UCHARAT(locinput); break; - sayNO; - case GPOS: - if (locinput == PL_reg_ganch) + } + if (!isDIGIT(nextchr)) + sayNO; + nextchr = UCHARAT(++locinput); + break; + case NDIGITL: + PL_reg_flags |= RF_tainted; + /* FALL THROUGH */ + case NDIGIT: + if (!nextchr) + sayNO; + if (OP(scan) == DIGIT + ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)) + sayNO; + nextchr = UCHARAT(++locinput); + break; + case NDIGITLUTF8: + PL_reg_flags |= RF_tainted; + /* FALL THROUGH */ + case NDIGITUTF8: + if (!nextchr && locinput >= PL_regeol) + sayNO; + if (nextchr & 0x80) { + if (swash_fetch(PL_utf8_digit,(U8*)locinput)) + sayNO; + locinput += PL_utf8skip[nextchr]; + nextchr = UCHARAT(locinput); break; - sayNO; - case EOL: - if (PL_multiline) - goto meol; - else - goto seol; - case MEOL: - meol: - if ((nextchr || locinput < PL_regeol) && nextchr != '\n') + } + if (isDIGIT(nextchr)) sayNO; + nextchr = UCHARAT(++locinput); break; - case SEOL: - seol: - if ((nextchr || locinput < PL_regeol) && nextchr != '\n') + case ALNUMCL: + PL_reg_flags |= RF_tainted; + /* FALL THROUGH */ + case ALNUMC: + if (!nextchr) sayNO; - if (PL_regeol - locinput > 1) + if (!(OP(scan) == ALNUMC + ? 
isALNUMC(nextchr) : isALNUMC_LC(nextchr))) + sayNO; + nextchr = UCHARAT(++locinput); + break; + case ALNUMCLUTF8: + PL_reg_flags |= RF_tainted; + /* FALL THROUGH */ + case ALNUMCUTF8: + if (!nextchr) + sayNO; + if (nextchr & 0x80) { + if (!(OP(scan) == ALNUMCUTF8 + ? swash_fetch(PL_utf8_alnumc, (U8*)locinput) + : isALNUMC_LC_utf8((U8*)locinput))) + { + sayNO; + } + locinput += PL_utf8skip[nextchr]; + nextchr = UCHARAT(locinput); + break; + } + if (!(OP(scan) == ALNUMCUTF8 + ? isALNUMC(nextchr) : isALNUMC_LC(nextchr))) + sayNO; + nextchr = UCHARAT(++locinput); + break; + case NALNUMCL: + PL_reg_flags |= RF_tainted; + /* FALL THROUGH */ + case NALNUMC: + if (!nextchr) + sayNO; + if (OP(scan) == ALNUMC + ? isALNUMC(nextchr) : isALNUMC_LC(nextchr)) + sayNO; + nextchr = UCHARAT(++locinput); + break; + case NALNUMCLUTF8: + PL_reg_flags |= RF_tainted; + /* FALL THROUGH */ + case NALNUMCUTF8: + if (!nextchr && locinput >= PL_regeol) + sayNO; + if (nextchr & 0x80) { + if (swash_fetch(PL_utf8_alnumc,(U8*)locinput)) + sayNO; + locinput += PL_utf8skip[nextchr]; + nextchr = UCHARAT(locinput); + break; + } + if (isALNUMC(nextchr)) + sayNO; + nextchr = UCHARAT(++locinput); + break; + case ALPHAL: + PL_reg_flags |= RF_tainted; + /* FALL THROUGH */ + case ALPHA: + if (!nextchr) + sayNO; + if (!(OP(scan) == ALPHA + ? isALPHA(nextchr) : isALPHA_LC(nextchr))) + sayNO; + nextchr = UCHARAT(++locinput); + break; + case ALPHALUTF8: + PL_reg_flags |= RF_tainted; + /* FALL THROUGH */ + case ALPHAUTF8: + if (!nextchr) + sayNO; + if (nextchr & 0x80) { + if (!(OP(scan) == ALPHAUTF8 + ? swash_fetch(PL_utf8_alpha, (U8*)locinput) + : isALPHA_LC_utf8((U8*)locinput))) + { + sayNO; + } + locinput += PL_utf8skip[nextchr]; + nextchr = UCHARAT(locinput); + break; + } + if (!(OP(scan) == ALPHAUTF8 + ? 
isALPHA(nextchr) : isALPHA_LC(nextchr))) + sayNO; + nextchr = UCHARAT(++locinput); + break; + case NALPHAL: + PL_reg_flags |= RF_tainted; + /* FALL THROUGH */ + case NALPHA: + if (!nextchr) + sayNO; + if (OP(scan) == ALPHA + ? isALPHA(nextchr) : isALPHA_LC(nextchr)) + sayNO; + nextchr = UCHARAT(++locinput); + break; + case NALPHALUTF8: + PL_reg_flags |= RF_tainted; + /* FALL THROUGH */ + case NALPHAUTF8: + if (!nextchr && locinput >= PL_regeol) + sayNO; + if (nextchr & 0x80) { + if (swash_fetch(PL_utf8_alpha,(U8*)locinput)) + sayNO; + locinput += PL_utf8skip[nextchr]; + nextchr = UCHARAT(locinput); + break; + } + if (isALPHA(nextchr)) + sayNO; + nextchr = UCHARAT(++locinput); + break; + case ASCII: + if (!nextchr && locinput >= PL_regeol) + sayNO; + if (!isASCII(nextchr)) + sayNO; + nextchr = UCHARAT(++locinput); + break; + case NASCII: + if (!nextchr && locinput >= PL_regeol) + sayNO; + if (isASCII(nextchr)) + sayNO; + nextchr = UCHARAT(++locinput); + break; + case CNTRLL: + PL_reg_flags |= RF_tainted; + /* FALL THROUGH */ + case CNTRL: + if (!nextchr) + sayNO; + if (!(OP(scan) == CNTRL + ? isCNTRL(nextchr) : isCNTRL_LC(nextchr))) + sayNO; + nextchr = UCHARAT(++locinput); + break; + case CNTRLLUTF8: + PL_reg_flags |= RF_tainted; + /* FALL THROUGH */ + case CNTRLUTF8: + if (!nextchr) + sayNO; + if (nextchr & 0x80) { + if (!(OP(scan) == CNTRLUTF8 + ? swash_fetch(PL_utf8_cntrl, (U8*)locinput) + : isCNTRL_LC_utf8((U8*)locinput))) + { + sayNO; + } + locinput += PL_utf8skip[nextchr]; + nextchr = UCHARAT(locinput); + break; + } + if (!(OP(scan) == CNTRLUTF8 + ? isCNTRL(nextchr) : isCNTRL_LC(nextchr))) sayNO; + nextchr = UCHARAT(++locinput); break; - case EOS: - if (PL_regeol != locinput) + case NCNTRLL: + PL_reg_flags |= RF_tainted; + /* FALL THROUGH */ + case NCNTRL: + if (!nextchr) sayNO; + if (OP(scan) == CNTRL + ? 
isCNTRL(nextchr) : isCNTRL_LC(nextchr)) + sayNO; + nextchr = UCHARAT(++locinput); break; - case SANYUTF8: + case NCNTRLLUTF8: + PL_reg_flags |= RF_tainted; + /* FALL THROUGH */ + case NCNTRLUTF8: + if (!nextchr && locinput >= PL_regeol) + sayNO; if (nextchr & 0x80) { - locinput += PL_utf8skip[nextchr]; - if (locinput > PL_regeol) + if (swash_fetch(PL_utf8_cntrl,(U8*)locinput)) sayNO; + locinput += PL_utf8skip[nextchr]; nextchr = UCHARAT(locinput); break; } - if (!nextchr && locinput >= PL_regeol) + if (isCNTRL(nextchr)) sayNO; nextchr = UCHARAT(++locinput); break; - case SANY: - if (!nextchr && locinput >= PL_regeol) + case GRAPHL: + PL_reg_flags |= RF_tainted; + /* FALL THROUGH */ + case GRAPH: + if (!nextchr) + sayNO; + if (!(OP(scan) == GRAPH + ? isGRAPH(nextchr) : isGRAPH_LC(nextchr))) sayNO; nextchr = UCHARAT(++locinput); break; - case ANYUTF8: + case GRAPHLUTF8: + PL_reg_flags |= RF_tainted; + /* FALL THROUGH */ + case GRAPHUTF8: + if (!nextchr) + sayNO; if (nextchr & 0x80) { - locinput += PL_utf8skip[nextchr]; - if (locinput > PL_regeol) + if (!(OP(scan) == GRAPHUTF8 + ? swash_fetch(PL_utf8_graph, (U8*)locinput) + : isGRAPH_LC_utf8((U8*)locinput))) + { sayNO; + } + locinput += PL_utf8skip[nextchr]; nextchr = UCHARAT(locinput); break; } - if (!nextchr && locinput >= PL_regeol || nextchr == '\n') + if (!(OP(scan) == GRAPHUTF8 + ? isGRAPH(nextchr) : isGRAPH_LC(nextchr))) sayNO; nextchr = UCHARAT(++locinput); break; - case REG_ANY: - if (!nextchr && locinput >= PL_regeol || nextchr == '\n') + case NGRAPHL: + PL_reg_flags |= RF_tainted; + /* FALL THROUGH */ + case NGRAPH: + if (!nextchr) + sayNO; + if (OP(scan) == GRAPH + ? isGRAPH(nextchr) : isGRAPH_LC(nextchr)) sayNO; nextchr = UCHARAT(++locinput); break; - case EXACT: - s = (char *) OPERAND(scan); - ln = UCHARAT(s++); - /* Inline the first character, for speed. 
*/ - if (UCHARAT(s) != nextchr) + case NGRAPHLUTF8: + PL_reg_flags |= RF_tainted; + /* FALL THROUGH */ + case NGRAPHUTF8: + if (!nextchr && locinput >= PL_regeol) sayNO; - if (PL_regeol - locinput < ln) + if (nextchr & 0x80) { + if (swash_fetch(PL_utf8_graph,(U8*)locinput)) + sayNO; + locinput += PL_utf8skip[nextchr]; + nextchr = UCHARAT(locinput); + break; + } + if (isGRAPH(nextchr)) sayNO; - if (ln > 1 && memNE(s, locinput, ln)) + nextchr = UCHARAT(++locinput); + break; + case LOWERL: + PL_reg_flags |= RF_tainted; + /* FALL THROUGH */ + case LOWER: + if (!nextchr) sayNO; - locinput += ln; - nextchr = UCHARAT(locinput); + if (!(OP(scan) == LOWER + ? isLOWER(nextchr) : isLOWER_LC(nextchr))) + sayNO; + nextchr = UCHARAT(++locinput); break; - case EXACTFL: + case LOWERLUTF8: PL_reg_flags |= RF_tainted; /* FALL THROUGH */ - case EXACTF: - s = (char *) OPERAND(scan); - ln = UCHARAT(s++); - - if (UTF) { - char *l = locinput; - char *e = s + ln; - c1 = OP(scan) == EXACTF; - while (s < e) { - if (l >= PL_regeol) - sayNO; - if (utf8_to_uv((U8*)s, 0) != (c1 ? - toLOWER_utf8((U8*)l) : - toLOWER_LC_utf8((U8*)l))) - { - sayNO; - } - s += UTF8SKIP(s); - l += UTF8SKIP(l); + case LOWERUTF8: + if (!nextchr) + sayNO; + if (nextchr & 0x80) { + if (!(OP(scan) == LOWERUTF8 + ? swash_fetch(PL_utf8_lower, (U8*)locinput) + : isLOWER_LC_utf8((U8*)locinput))) + { + sayNO; } - locinput = l; + locinput += PL_utf8skip[nextchr]; nextchr = UCHARAT(locinput); break; } - - /* Inline the first character, for speed. */ - if (UCHARAT(s) != nextchr && - UCHARAT(s) != ((OP(scan) == EXACTF) - ? PL_fold : PL_fold_locale)[nextchr]) - sayNO; - if (PL_regeol - locinput < ln) - sayNO; - if (ln > 1 && (OP(scan) == EXACTF - ? ibcmp(s, locinput, ln) - : ibcmp_locale(s, locinput, ln))) + if (!(OP(scan) == LOWERUTF8 + ? 
isLOWER(nextchr) : isLOWER_LC(nextchr))) sayNO; - locinput += ln; - nextchr = UCHARAT(locinput); + nextchr = UCHARAT(++locinput); break; - case ANYOFUTF8: - s = (char *) OPERAND(scan); - if (!REGINCLASSUTF8(scan, (U8*)locinput)) + case NLOWERL: + PL_reg_flags |= RF_tainted; + /* FALL THROUGH */ + case NLOWER: + if (!nextchr) sayNO; - if (locinput >= PL_regeol) + if (OP(scan) == LOWER + ? isLOWER(nextchr) : isLOWER_LC(nextchr)) sayNO; - locinput += PL_utf8skip[nextchr]; - nextchr = UCHARAT(locinput); + nextchr = UCHARAT(++locinput); break; - case ANYOF: - s = (char *) OPERAND(scan); - if (nextchr < 0) - nextchr = UCHARAT(locinput); - if (!REGINCLASS(s, nextchr)) - sayNO; + case NLOWERLUTF8: + PL_reg_flags |= RF_tainted; + /* FALL THROUGH */ + case NLOWERUTF8: if (!nextchr && locinput >= PL_regeol) sayNO; + if (nextchr & 0x80) { + if (swash_fetch(PL_utf8_lower,(U8*)locinput)) + sayNO; + locinput += PL_utf8skip[nextchr]; + nextchr = UCHARAT(locinput); + break; + } + if (isLOWER(nextchr)) + sayNO; nextchr = UCHARAT(++locinput); break; - case ALNUML: + case PRINTL: PL_reg_flags |= RF_tainted; /* FALL THROUGH */ - case ALNUM: + case PRINT: if (!nextchr) sayNO; - if (!(OP(scan) == ALNUM - ? isALNUM(nextchr) : isALNUM_LC(nextchr))) + if (!(OP(scan) == PRINT + ? isPRINT(nextchr) : isPRINT_LC(nextchr))) sayNO; nextchr = UCHARAT(++locinput); break; - case ALNUMLUTF8: + case PRINTLUTF8: PL_reg_flags |= RF_tainted; /* FALL THROUGH */ - case ALNUMUTF8: + case PRINTUTF8: if (!nextchr) sayNO; if (nextchr & 0x80) { - if (!(OP(scan) == ALNUMUTF8 - ? swash_fetch(PL_utf8_alnum, (U8*)locinput) - : isALNUM_LC_utf8((U8*)locinput))) + if (!(OP(scan) == PRINTUTF8 + ? swash_fetch(PL_utf8_print, (U8*)locinput) + : isPRINT_LC_utf8((U8*)locinput))) { sayNO; } @@ -1564,137 +2978,121 @@ S_regmatch(pTHX_ regnode *prog) nextchr = UCHARAT(locinput); break; } - if (!(OP(scan) == ALNUMUTF8 - ? isALNUM(nextchr) : isALNUM_LC(nextchr))) + if (!(OP(scan) == PRINTUTF8 + ? 
isPRINT(nextchr) : isPRINT_LC(nextchr))) sayNO; nextchr = UCHARAT(++locinput); break; - case NALNUML: + case NPRINTL: PL_reg_flags |= RF_tainted; /* FALL THROUGH */ - case NALNUM: - if (!nextchr && locinput >= PL_regeol) + case NPRINT: + if (!nextchr) sayNO; - if (OP(scan) == NALNUM - ? isALNUM(nextchr) : isALNUM_LC(nextchr)) + if (OP(scan) == PRINT + ? isPRINT(nextchr) : isPRINT_LC(nextchr)) sayNO; nextchr = UCHARAT(++locinput); break; - case NALNUMLUTF8: + case NPRINTLUTF8: PL_reg_flags |= RF_tainted; /* FALL THROUGH */ - case NALNUMUTF8: + case NPRINTUTF8: if (!nextchr && locinput >= PL_regeol) sayNO; if (nextchr & 0x80) { - if (OP(scan) == NALNUMUTF8 - ? swash_fetch(PL_utf8_alnum, (U8*)locinput) - : isALNUM_LC_utf8((U8*)locinput)) - { + if (swash_fetch(PL_utf8_print,(U8*)locinput)) sayNO; - } locinput += PL_utf8skip[nextchr]; nextchr = UCHARAT(locinput); break; } - if (OP(scan) == NALNUMUTF8 - ? isALNUM(nextchr) : isALNUM_LC(nextchr)) + if (isPRINT(nextchr)) sayNO; nextchr = UCHARAT(++locinput); break; - case BOUNDL: - case NBOUNDL: + case PUNCTL: PL_reg_flags |= RF_tainted; /* FALL THROUGH */ - case BOUND: - case NBOUND: - /* was last char in word? */ - ln = (locinput != PL_regbol) ? UCHARAT(locinput - 1) : PL_regprev; - if (OP(scan) == BOUND || OP(scan) == NBOUND) { - ln = isALNUM(ln); - n = isALNUM(nextchr); - } - else { - ln = isALNUM_LC(ln); - n = isALNUM_LC(nextchr); - } - if (((!ln) == (!n)) == (OP(scan) == BOUND || OP(scan) == BOUNDL)) + case PUNCT: + if (!nextchr) + sayNO; + if (!(OP(scan) == PUNCT + ? isPUNCT(nextchr) : isPUNCT_LC(nextchr))) sayNO; + nextchr = UCHARAT(++locinput); break; - case BOUNDLUTF8: - case NBOUNDLUTF8: + case PUNCTLUTF8: PL_reg_flags |= RF_tainted; /* FALL THROUGH */ - case BOUNDUTF8: - case NBOUNDUTF8: - /* was last char in word? */ - ln = (locinput != PL_regbol) - ? 
utf8_to_uv(reghop((U8*)locinput, -1), 0) : PL_regprev; - if (OP(scan) == BOUNDUTF8 || OP(scan) == NBOUNDUTF8) { - ln = isALNUM_uni(ln); - n = swash_fetch(PL_utf8_alnum, (U8*)locinput); - } - else { - ln = isALNUM_LC_uni(ln); - n = isALNUM_LC_utf8((U8*)locinput); + case PUNCTUTF8: + if (!nextchr) + sayNO; + if (nextchr & 0x80) { + if (!(OP(scan) == PUNCTUTF8 + ? swash_fetch(PL_utf8_punct, (U8*)locinput) + : isPUNCT_LC_utf8((U8*)locinput))) + { + sayNO; + } + locinput += PL_utf8skip[nextchr]; + nextchr = UCHARAT(locinput); + break; } - if (((!ln) == (!n)) == (OP(scan) == BOUNDUTF8 || OP(scan) == BOUNDLUTF8)) + if (!(OP(scan) == PUNCTUTF8 + ? isPUNCT(nextchr) : isPUNCT_LC(nextchr))) sayNO; + nextchr = UCHARAT(++locinput); break; - case SPACEL: + case NPUNCTL: PL_reg_flags |= RF_tainted; /* FALL THROUGH */ - case SPACE: - if (!nextchr && locinput >= PL_regeol) + case NPUNCT: + if (!nextchr) sayNO; - if (!(OP(scan) == SPACE - ? isSPACE(nextchr) : isSPACE_LC(nextchr))) + if (OP(scan) == PUNCT + ? isPUNCT(nextchr) : isPUNCT_LC(nextchr)) sayNO; nextchr = UCHARAT(++locinput); break; - case SPACELUTF8: + case NPUNCTLUTF8: PL_reg_flags |= RF_tainted; /* FALL THROUGH */ - case SPACEUTF8: + case NPUNCTUTF8: if (!nextchr && locinput >= PL_regeol) sayNO; if (nextchr & 0x80) { - if (!(OP(scan) == SPACEUTF8 - ? swash_fetch(PL_utf8_space,(U8*)locinput) - : isSPACE_LC_utf8((U8*)locinput))) - { + if (swash_fetch(PL_utf8_punct,(U8*)locinput)) sayNO; - } locinput += PL_utf8skip[nextchr]; nextchr = UCHARAT(locinput); break; } - if (!(OP(scan) == SPACEUTF8 - ? isSPACE(nextchr) : isSPACE_LC(nextchr))) + if (isPUNCT(nextchr)) sayNO; nextchr = UCHARAT(++locinput); break; - case NSPACEL: + case UPPERL: PL_reg_flags |= RF_tainted; /* FALL THROUGH */ - case NSPACE: + case UPPER: if (!nextchr) sayNO; - if (OP(scan) == SPACE - ? isSPACE(nextchr) : isSPACE_LC(nextchr)) + if (!(OP(scan) == UPPER + ? 
isUPPER(nextchr) : isUPPER_LC(nextchr))) sayNO; nextchr = UCHARAT(++locinput); break; - case NSPACELUTF8: + case UPPERLUTF8: PL_reg_flags |= RF_tainted; /* FALL THROUGH */ - case NSPACEUTF8: + case UPPERUTF8: if (!nextchr) sayNO; if (nextchr & 0x80) { - if (OP(scan) == NSPACEUTF8 - ? swash_fetch(PL_utf8_space,(U8*)locinput) - : isSPACE_LC_utf8((U8*)locinput)) + if (!(OP(scan) == UPPERUTF8 + ? swash_fetch(PL_utf8_upper, (U8*)locinput) + : isUPPER_LC_utf8((U8*)locinput))) { sayNO; } @@ -1702,46 +3100,50 @@ S_regmatch(pTHX_ regnode *prog) nextchr = UCHARAT(locinput); break; } - if (OP(scan) == NSPACEUTF8 - ? isSPACE(nextchr) : isSPACE_LC(nextchr)) + if (!(OP(scan) == UPPERUTF8 + ? isUPPER(nextchr) : isUPPER_LC(nextchr))) sayNO; nextchr = UCHARAT(++locinput); break; - case DIGIT: - if (!isDIGIT(nextchr)) + case NUPPERL: + PL_reg_flags |= RF_tainted; + /* FALL THROUGH */ + case NUPPER: + if (!nextchr) + sayNO; + if (OP(scan) == UPPER + ? isUPPER(nextchr) : isUPPER_LC(nextchr)) sayNO; nextchr = UCHARAT(++locinput); break; - case DIGITUTF8: + case NUPPERLUTF8: + PL_reg_flags |= RF_tainted; + /* FALL THROUGH */ + case NUPPERUTF8: + if (!nextchr && locinput >= PL_regeol) + sayNO; if (nextchr & 0x80) { - if (!(swash_fetch(PL_utf8_digit,(U8*)locinput))) + if (swash_fetch(PL_utf8_upper,(U8*)locinput)) sayNO; locinput += PL_utf8skip[nextchr]; nextchr = UCHARAT(locinput); break; } - if (!isDIGIT(nextchr)) + if (isUPPER(nextchr)) sayNO; nextchr = UCHARAT(++locinput); break; - case NDIGIT: + case XDIGIT: if (!nextchr && locinput >= PL_regeol) sayNO; - if (isDIGIT(nextchr)) + if (!isXDIGIT(nextchr)) sayNO; nextchr = UCHARAT(++locinput); break; - case NDIGITUTF8: + case NXDIGIT: if (!nextchr && locinput >= PL_regeol) sayNO; - if (nextchr & 0x80) { - if (swash_fetch(PL_utf8_digit,(U8*)locinput)) - sayNO; - locinput += PL_utf8skip[nextchr]; - nextchr = UCHARAT(locinput); - break; - } - if (isDIGIT(nextchr)) + if (isXDIGIT(nextchr)) sayNO; nextchr = UCHARAT(++locinput); break; @@ 
-2920,11 +4322,11 @@ STATIC bool S_reginclass(pTHX_ register char *p, register I32 c) { dTHR; - char flags = *p; + char flags = ANYOF_FLAGS(p); bool match = FALSE; c &= 0xFF; - if (ANYOF_TEST(p, c)) + if (ANYOF_BITMAP_TEST(p, c)) match = TRUE; else if (flags & ANYOF_FOLD) { I32 cf; @@ -2934,17 +4336,40 @@ S_reginclass(pTHX_ register char *p, register I32 c) } else cf = PL_fold[c]; - if (ANYOF_TEST(p, cf)) + if (ANYOF_BITMAP_TEST(p, cf)) match = TRUE; } - if (!match && (flags & ANYOF_ISA)) { + if (!match && (flags & ANYOF_CLASS)) { PL_reg_flags |= RF_tainted; - - if (((flags & ANYOF_ALNUML) && isALNUM_LC(c)) || - ((flags & ANYOF_NALNUML) && !isALNUM_LC(c)) || - ((flags & ANYOF_SPACEL) && isSPACE_LC(c)) || - ((flags & ANYOF_NSPACEL) && !isSPACE_LC(c))) + if ( + (ANYOF_CLASS_TEST(p, ANYOF_ALNUM) && isALNUM_LC(c)) || + (ANYOF_CLASS_TEST(p, ANYOF_NALNUM) && !isALNUM_LC(c)) || + (ANYOF_CLASS_TEST(p, ANYOF_SPACE) && isSPACE_LC(c)) || + (ANYOF_CLASS_TEST(p, ANYOF_NSPACE) && !isSPACE_LC(c)) || + (ANYOF_CLASS_TEST(p, ANYOF_DIGIT) && isDIGIT_LC(c)) || + (ANYOF_CLASS_TEST(p, ANYOF_NDIGIT) && !isDIGIT_LC(c)) || + (ANYOF_CLASS_TEST(p, ANYOF_ALNUMC) && isALNUMC_LC(c)) || + (ANYOF_CLASS_TEST(p, ANYOF_NALNUMC) && !isALNUMC_LC(c)) || + (ANYOF_CLASS_TEST(p, ANYOF_ALPHA) && isALPHA_LC(c)) || + (ANYOF_CLASS_TEST(p, ANYOF_NALPHA) && !isALPHA_LC(c)) || + (ANYOF_CLASS_TEST(p, ANYOF_ASCII) && isASCII(c)) || + (ANYOF_CLASS_TEST(p, ANYOF_NASCII) && !isASCII(c)) || + (ANYOF_CLASS_TEST(p, ANYOF_CNTRL) && isCNTRL_LC(c)) || + (ANYOF_CLASS_TEST(p, ANYOF_NCNTRL) && !isCNTRL_LC(c)) || + (ANYOF_CLASS_TEST(p, ANYOF_GRAPH) && isGRAPH_LC(c)) || + (ANYOF_CLASS_TEST(p, ANYOF_NGRAPH) && !isGRAPH_LC(c)) || + (ANYOF_CLASS_TEST(p, ANYOF_LOWER) && isLOWER_LC(c)) || + (ANYOF_CLASS_TEST(p, ANYOF_NLOWER) && !isLOWER_LC(c)) || + (ANYOF_CLASS_TEST(p, ANYOF_PRINT) && isPRINT_LC(c)) || + (ANYOF_CLASS_TEST(p, ANYOF_NPRINT) && !isPRINT_LC(c)) || + (ANYOF_CLASS_TEST(p, ANYOF_PUNCT) && isPUNCT_LC(c)) || + 
(ANYOF_CLASS_TEST(p, ANYOF_NPUNCT) && !isPUNCT_LC(c)) || + (ANYOF_CLASS_TEST(p, ANYOF_UPPER) && isUPPER_LC(c)) || + (ANYOF_CLASS_TEST(p, ANYOF_NUPPER) && !isUPPER_LC(c)) || + (ANYOF_CLASS_TEST(p, ANYOF_XDIGIT) && isXDIGIT(c)) || + (ANYOF_CLASS_TEST(p, ANYOF_NXDIGIT) && !isXDIGIT(c)) + ) /* How's that for a conditional? */ { match = TRUE; } @@ -2976,17 +4401,7 @@ S_reginclassutf8(pTHX_ regnode *f, U8 *p) match = TRUE; } - if (!match && (flags & ANYOF_ISA)) { - PL_reg_flags |= RF_tainted; - - if (((flags & ANYOF_ALNUML) && isALNUM_LC_utf8(p)) || - ((flags & ANYOF_NALNUML) && !isALNUM_LC_utf8(p)) || - ((flags & ANYOF_SPACEL) && isSPACE_LC_utf8(p)) || - ((flags & ANYOF_NSPACEL) && !isSPACE_LC_utf8(p))) - { - match = TRUE; - } - } + /* UTF8 combined with ANYOF_CLASS is ill-defined. */ return (flags & ANYOF_INVERT) ? !match : match; } diff --git a/regnodes.h b/regnodes.h index 030fa1a..cdc6dd4 100644 --- a/regnodes.h +++ b/regnodes.h @@ -45,40 +45,112 @@ #define NSPACELUTF8 39 /* 0x27 Match any non-whitespace char in locale */ #define DIGIT 40 /* 0x28 Match any numeric character */ #define DIGITUTF8 41 /* 0x29 Match any numeric character */ -#define NDIGIT 42 /* 0x2a Match any non-numeric character */ -#define NDIGITUTF8 43 /* 0x2b Match any non-numeric character */ -#define CLUMP 44 /* 0x2c Match any combining character sequence */ -#define BRANCH 45 /* 0x2d Match this alternative, or the next... */ -#define BACK 46 /* 0x2e Match "", "next" ptr points backward. */ -#define EXACT 47 /* 0x2f Match this string (preceded by length). */ -#define EXACTF 48 /* 0x30 Match this string, folded (prec. by length). */ -#define EXACTFL 49 /* 0x31 Match this string, folded in locale (w/len). */ -#define NOTHING 50 /* 0x32 Match empty string. */ -#define TAIL 51 /* 0x33 Match empty string. Can jump here from outside. */ -#define STAR 52 /* 0x34 Match this (simple) thing 0 or more times. */ -#define PLUS 53 /* 0x35 Match this (simple) thing 1 or more times. 
*/ -#define CURLY 54 /* 0x36 Match this simple thing {n,m} times. */ -#define CURLYN 55 /* 0x37 Match next-after-this simple thing */ -#define CURLYM 56 /* 0x38 Match this medium-complex thing {n,m} times. */ -#define CURLYX 57 /* 0x39 Match this complex thing {n,m} times. */ -#define WHILEM 58 /* 0x3a Do curly processing and see if rest matches. */ -#define OPEN 59 /* 0x3b Mark this point in input as start of #n. */ -#define CLOSE 60 /* 0x3c Analogous to OPEN. */ -#define REF 61 /* 0x3d Match some already matched string */ -#define REFF 62 /* 0x3e Match already matched string, folded */ -#define REFFL 63 /* 0x3f Match already matched string, folded in loc. */ -#define IFMATCH 64 /* 0x40 Succeeds if the following matches. */ -#define UNLESSM 65 /* 0x41 Fails if the following matches. */ -#define SUSPEND 66 /* 0x42 "Independent" sub-RE. */ -#define IFTHEN 67 /* 0x43 Switch, should be preceeded by switcher . */ -#define GROUPP 68 /* 0x44 Whether the group matched. */ -#define LONGJMP 69 /* 0x45 Jump far away. */ -#define BRANCHJ 70 /* 0x46 BRANCH with long offset. */ -#define EVAL 71 /* 0x47 Execute some Perl code. */ -#define MINMOD 72 /* 0x48 Next operator is not greedy. */ -#define LOGICAL 73 /* 0x49 Next opcode should set the flag only. */ -#define RENUM 74 /* 0x4a Group with independently numbered parens. */ -#define OPTIMIZED 75 /* 0x4b Placeholder for dump. 
*/ +#define DIGITL 42 /* 0x2a Match any numeric character in locale */ +#define DIGITLUTF8 43 /* 0x2b Match any numeric character in locale */ +#define NDIGIT 44 /* 0x2c Match any non-numeric character */ +#define NDIGITUTF8 45 /* 0x2d Match any non-numeric character */ +#define NDIGITL 46 /* 0x2e Match any non-numeric character in locale */ +#define NDIGITLUTF8 47 /* 0x2f Match any non-numeric character in locale */ +#define ALNUMC 48 /* 0x30 Match any alphanumeric character */ +#define ALNUMCUTF8 49 /* 0x31 Match any alphanumeric character */ +#define ALNUMCL 50 /* 0x32 Match any alphanumeric character in locale */ +#define ALNUMCLUTF8 51 /* 0x33 Match any alphanumeric character in locale */ +#define NALNUMC 52 /* 0x34 Match any non-alphanumeric character */ +#define NALNUMCUTF8 53 /* 0x35 Match any non-alphanumeric character */ +#define NALNUMCL 54 /* 0x36 Match any non-alphanumeric character in locale */ +#define NALNUMCLUTF8 55 /* 0x37 Match any non-alphanumeric character in locale */ +#define ALPHA 56 /* 0x38 Match any alphabetic character */ +#define ALPHAUTF8 57 /* 0x39 Match any alphabetic character */ +#define ALPHAL 58 /* 0x3a Match any alphabetic character in locale */ +#define ALPHALUTF8 59 /* 0x3b Match any alphabetic character in locale */ +#define NALPHA 60 /* 0x3c Match any non-alphabetic character */ +#define NALPHAUTF8 61 /* 0x3d Match any non-alphabetic character */ +#define NALPHAL 62 /* 0x3e Match any non-alphabetic character in locale */ +#define NALPHALUTF8 63 /* 0x3f Match any non-alphabetic character in locale */ +#define ASCII 64 /* 0x40 Match any ASCII character */ +#define NASCII 65 /* 0x41 Match any non-ASCII character */ +#define CNTRL 66 /* 0x42 Match any control character */ +#define CNTRLUTF8 67 /* 0x43 Match any control character */ +#define CNTRLL 68 /* 0x44 Match any control character in locale */ +#define CNTRLLUTF8 69 /* 0x45 Match any control character in locale */ +#define NCNTRL 70 /* 0x46 Match any non-control character */ 
+#define NCNTRLUTF8 71 /* 0x47 Match any non-control character */ +#define NCNTRLL 72 /* 0x48 Match any non-control character in locale */ +#define NCNTRLLUTF8 73 /* 0x49 Match any non-control character in locale */ +#define GRAPH 74 /* 0x4a Match any graphical character */ +#define GRAPHUTF8 75 /* 0x4b Match any graphical character */ +#define GRAPHL 76 /* 0x4c Match any graphical character in locale */ +#define GRAPHLUTF8 77 /* 0x4d Match any graphical character in locale */ +#define NGRAPH 78 /* 0x4e Match any non-graphical character */ +#define NGRAPHUTF8 79 /* 0x4f Match any non-graphical character */ +#define NGRAPHL 80 /* 0x50 Match any non-graphical character in locale */ +#define NGRAPHLUTF8 81 /* 0x51 Match any non-graphical character in locale */ +#define LOWER 82 /* 0x52 Match any lowercase character */ +#define LOWERUTF8 83 /* 0x53 Match any lowercase character */ +#define LOWERL 84 /* 0x54 Match any lowercase character in locale */ +#define LOWERLUTF8 85 /* 0x55 Match any lowercase character in locale */ +#define NLOWER 86 /* 0x56 Match any non-lowercase character */ +#define NLOWERUTF8 87 /* 0x57 Match any non-lowercase character */ +#define NLOWERL 88 /* 0x58 Match any non-lowercase character in locale */ +#define NLOWERLUTF8 89 /* 0x59 Match any non-lowercase character in locale */ +#define PRINT 90 /* 0x5a Match any printable character */ +#define PRINTUTF8 91 /* 0x5b Match any printable character */ +#define PRINTL 92 /* 0x5c Match any printable character in locale */ +#define PRINTLUTF8 93 /* 0x5d Match any printable character in locale */ +#define NPRINT 94 /* 0x5e Match any non-printable character */ +#define NPRINTUTF8 95 /* 0x5f Match any non-printable character */ +#define NPRINTL 96 /* 0x60 Match any non-printable character in locale */ +#define NPRINTLUTF8 97 /* 0x61 Match any non-printable character in locale */ +#define PUNCT 98 /* 0x62 Match any punctuation character */ +#define PUNCTUTF8 99 /* 0x63 Match any punctuation character */ 
+#define PUNCTL 100 /* 0x64 Match any punctuation character in locale */ +#define PUNCTLUTF8 101 /* 0x65 Match any punctuation character in locale */ +#define NPUNCT 102 /* 0x66 Match any non-punctuation character */ +#define NPUNCTUTF8 103 /* 0x67 Match any non-punctuation character */ +#define NPUNCTL 104 /* 0x68 Match any non-punctuation character in locale */ +#define NPUNCTLUTF8 105 /* 0x69 Match any non-punctuation character in locale */ +#define UPPER 106 /* 0x6a Match any uppercase character */ +#define UPPERUTF8 107 /* 0x6b Match any uppercase character */ +#define UPPERL 108 /* 0x6c Match any uppercase character in locale */ +#define UPPERLUTF8 109 /* 0x6d Match any uppercase character in locale */ +#define NUPPER 110 /* 0x6e Match any non-uppercase character */ +#define NUPPERUTF8 111 /* 0x6f Match any non-uppercase character */ +#define NUPPERL 112 /* 0x70 Match any non-uppercase character in locale */ +#define NUPPERLUTF8 113 /* 0x71 Match any non-uppercase character in locale */ +#define XDIGIT 114 /* 0x72 Match any hexdigit character */ +#define NXDIGIT 115 /* 0x73 Match any non-hexdigit character */ +#define CLUMP 116 /* 0x74 Match any combining character sequence */ +#define BRANCH 117 /* 0x75 Match this alternative, or the next... */ +#define BACK 118 /* 0x76 Match "", "next" ptr points backward. */ +#define EXACT 119 /* 0x77 Match this string (preceded by length). */ +#define EXACTF 120 /* 0x78 Match this string, folded (prec. by length). */ +#define EXACTFL 121 /* 0x79 Match this string, folded in locale (w/len). */ +#define NOTHING 122 /* 0x7a Match empty string. */ +#define TAIL 123 /* 0x7b Match empty string. Can jump here from outside. */ +#define STAR 124 /* 0x7c Match this (simple) thing 0 or more times. */ +#define PLUS 125 /* 0x7d Match this (simple) thing 1 or more times. */ +#define CURLY 126 /* 0x7e Match this simple thing {n,m} times. 
*/ +#define CURLYN 127 /* 0x7f Match next-after-this simple thing */ +#define CURLYM 128 /* 0x80 Match this medium-complex thing {n,m} times. */ +#define CURLYX 129 /* 0x81 Match this complex thing {n,m} times. */ +#define WHILEM 130 /* 0x82 Do curly processing and see if rest matches. */ +#define OPEN 131 /* 0x83 Mark this point in input as start of #n. */ +#define CLOSE 132 /* 0x84 Analogous to OPEN. */ +#define REF 133 /* 0x85 Match some already matched string */ +#define REFF 134 /* 0x86 Match already matched string, folded */ +#define REFFL 135 /* 0x87 Match already matched string, folded in loc. */ +#define IFMATCH 136 /* 0x88 Succeeds if the following matches. */ +#define UNLESSM 137 /* 0x89 Fails if the following matches. */ +#define SUSPEND 138 /* 0x8a "Independent" sub-RE. */ +#define IFTHEN 139 /* 0x8b Switch, should be preceeded by switcher . */ +#define GROUPP 140 /* 0x8c Whether the group matched. */ +#define LONGJMP 141 /* 0x8d Jump far away. */ +#define BRANCHJ 142 /* 0x8e BRANCH with long offset. */ +#define EVAL 143 /* 0x8f Execute some Perl code. */ +#define MINMOD 144 /* 0x90 Next operator is not greedy. */ +#define LOGICAL 145 /* 0x91 Next opcode should set the flag only. */ +#define RENUM 146 /* 0x92 Group with independently numbered parens. */ +#define OPTIMIZED 147 /* 0x93 Placeholder for dump. 
*/ #ifndef DOINIT EXTCONST U8 PL_regkind[]; @@ -126,8 +198,80 @@ EXTCONST U8 PL_regkind[] = { NSPACE, /* NSPACELUTF8 */ DIGIT, /* DIGIT */ DIGIT, /* DIGITUTF8 */ + DIGIT, /* DIGITL */ + DIGIT, /* DIGITLUTF8 */ NDIGIT, /* NDIGIT */ NDIGIT, /* NDIGITUTF8 */ + NDIGIT, /* NDIGITL */ + NDIGIT, /* NDIGITLUTF8 */ + ALNUMC, /* ALNUMC */ + ALNUMC, /* ALNUMCUTF8 */ + ALNUMC, /* ALNUMCL */ + ALNUMC, /* ALNUMCLUTF8 */ + NALNUMC, /* NALNUMC */ + NALNUMC, /* NALNUMCUTF8 */ + NALNUMC, /* NALNUMCL */ + NALNUMC, /* NALNUMCLUTF8 */ + ALPHA, /* ALPHA */ + ALPHA, /* ALPHAUTF8 */ + ALPHA, /* ALPHAL */ + ALPHA, /* ALPHALUTF8 */ + NALPHA, /* NALPHA */ + NALPHA, /* NALPHAUTF8 */ + NALPHA, /* NALPHAL */ + NALPHA, /* NALPHALUTF8 */ + ASCII, /* ASCII */ + NASCII, /* NASCII */ + CNTRL, /* CNTRL */ + CNTRL, /* CNTRLUTF8 */ + CNTRL, /* CNTRLL */ + CNTRL, /* CNTRLLUTF8 */ + NCNTRL, /* NCNTRL */ + NCNTRL, /* NCNTRLUTF8 */ + NCNTRL, /* NCNTRLL */ + NCNTRL, /* NCNTRLLUTF8 */ + GRAPH, /* GRAPH */ + GRAPH, /* GRAPHUTF8 */ + GRAPH, /* GRAPHL */ + GRAPH, /* GRAPHLUTF8 */ + NGRAPH, /* NGRAPH */ + NGRAPH, /* NGRAPHUTF8 */ + NGRAPH, /* NGRAPHL */ + NGRAPH, /* NGRAPHLUTF8 */ + LOWER, /* LOWER */ + LOWER, /* LOWERUTF8 */ + LOWER, /* LOWERL */ + LOWER, /* LOWERLUTF8 */ + NLOWER, /* NLOWER */ + NLOWER, /* NLOWERUTF8 */ + NLOWER, /* NLOWERL */ + NLOWER, /* NLOWERLUTF8 */ + PRINT, /* PRINT */ + PRINT, /* PRINTUTF8 */ + PRINT, /* PRINTL */ + PRINT, /* PRINTLUTF8 */ + NPRINT, /* NPRINT */ + NPRINT, /* NPRINTUTF8 */ + NPRINT, /* NPRINTL */ + NPRINT, /* NPRINTLUTF8 */ + PUNCT, /* PUNCT */ + PUNCT, /* PUNCTUTF8 */ + PUNCT, /* PUNCTL */ + PUNCT, /* PUNCTLUTF8 */ + NPUNCT, /* NPUNCT */ + NPUNCT, /* NPUNCTUTF8 */ + NPUNCT, /* NPUNCTL */ + NPUNCT, /* NPUNCTLUTF8 */ + UPPER, /* UPPER */ + UPPER, /* UPPERUTF8 */ + UPPER, /* UPPERL */ + UPPER, /* UPPERLUTF8 */ + NUPPER, /* NUPPER */ + NUPPER, /* NUPPERUTF8 */ + NUPPER, /* NUPPERL */ + NUPPER, /* NUPPERLUTF8 */ + XDIGIT, /* XDIGIT */ + NXDIGIT, /* NXDIGIT */ CLUMP, /* CLUMP 
*/ BRANCH, /* BRANCH */ BACK, /* BACK */ @@ -208,8 +352,80 @@ const static U8 regarglen[] = { 0, /* NSPACELUTF8 */ 0, /* DIGIT */ 0, /* DIGITUTF8 */ + 0, /* DIGITL */ + 0, /* DIGITLUTF8 */ 0, /* NDIGIT */ 0, /* NDIGITUTF8 */ + 0, /* NDIGITL */ + 0, /* NDIGITLUTF8 */ + 0, /* ALNUMC */ + 0, /* ALNUMCUTF8 */ + 0, /* ALNUMCL */ + 0, /* ALNUMCLUTF8 */ + 0, /* NALNUMC */ + 0, /* NALNUMCUTF8 */ + 0, /* NALNUMCL */ + 0, /* NALNUMCLUTF8 */ + 0, /* ALPHA */ + 0, /* ALPHAUTF8 */ + 0, /* ALPHAL */ + 0, /* ALPHALUTF8 */ + 0, /* NALPHA */ + 0, /* NALPHAUTF8 */ + 0, /* NALPHAL */ + 0, /* NALPHALUTF8 */ + 0, /* ASCII */ + 0, /* NASCII */ + 0, /* CNTRL */ + 0, /* CNTRLUTF8 */ + 0, /* CNTRLL */ + 0, /* CNTRLLUTF8 */ + 0, /* NCNTRL */ + 0, /* NCNTRLUTF8 */ + 0, /* NCNTRLL */ + 0, /* NCNTRLLUTF8 */ + 0, /* GRAPH */ + 0, /* GRAPHUTF8 */ + 0, /* GRAPHL */ + 0, /* GRAPHLUTF8 */ + 0, /* NGRAPH */ + 0, /* NGRAPHUTF8 */ + 0, /* NGRAPHL */ + 0, /* NGRAPHLUTF8 */ + 0, /* LOWER */ + 0, /* LOWERUTF8 */ + 0, /* LOWERL */ + 0, /* LOWERLUTF8 */ + 0, /* NLOWER */ + 0, /* NLOWERUTF8 */ + 0, /* NLOWERL */ + 0, /* NLOWERLUTF8 */ + 0, /* PRINT */ + 0, /* PRINTUTF8 */ + 0, /* PRINTL */ + 0, /* PRINTLUTF8 */ + 0, /* NPRINT */ + 0, /* NPRINTUTF8 */ + 0, /* NPRINTL */ + 0, /* NPRINTLUTF8 */ + 0, /* PUNCT */ + 0, /* PUNCTUTF8 */ + 0, /* PUNCTL */ + 0, /* PUNCTLUTF8 */ + 0, /* NPUNCT */ + 0, /* NPUNCTUTF8 */ + 0, /* NPUNCTL */ + 0, /* NPUNCTLUTF8 */ + 0, /* UPPER */ + 0, /* UPPERUTF8 */ + 0, /* UPPERL */ + 0, /* UPPERLUTF8 */ + 0, /* NUPPER */ + 0, /* NUPPERUTF8 */ + 0, /* NUPPERL */ + 0, /* NUPPERLUTF8 */ + 0, /* XDIGIT */ + 0, /* NXDIGIT */ 0, /* CLUMP */ 0, /* BRANCH */ 0, /* BACK */ @@ -287,8 +503,80 @@ const static char reg_off_by_arg[] = { 0, /* NSPACELUTF8 */ 0, /* DIGIT */ 0, /* DIGITUTF8 */ + 0, /* DIGITL */ + 0, /* DIGITLUTF8 */ 0, /* NDIGIT */ 0, /* NDIGITUTF8 */ + 0, /* NDIGITL */ + 0, /* NDIGITLUTF8 */ + 0, /* ALNUMC */ + 0, /* ALNUMCUTF8 */ + 0, /* ALNUMCL */ + 0, /* ALNUMCLUTF8 */ + 0, /* 
NALNUMC */ + 0, /* NALNUMCUTF8 */ + 0, /* NALNUMCL */ + 0, /* NALNUMCLUTF8 */ + 0, /* ALPHA */ + 0, /* ALPHAUTF8 */ + 0, /* ALPHAL */ + 0, /* ALPHALUTF8 */ + 0, /* NALPHA */ + 0, /* NALPHAUTF8 */ + 0, /* NALPHAL */ + 0, /* NALPHALUTF8 */ + 0, /* ASCII */ + 0, /* NASCII */ + 0, /* CNTRL */ + 0, /* CNTRLUTF8 */ + 0, /* CNTRLL */ + 0, /* CNTRLLUTF8 */ + 0, /* NCNTRL */ + 0, /* NCNTRLUTF8 */ + 0, /* NCNTRLL */ + 0, /* NCNTRLLUTF8 */ + 0, /* GRAPH */ + 0, /* GRAPHUTF8 */ + 0, /* GRAPHL */ + 0, /* GRAPHLUTF8 */ + 0, /* NGRAPH */ + 0, /* NGRAPHUTF8 */ + 0, /* NGRAPHL */ + 0, /* NGRAPHLUTF8 */ + 0, /* LOWER */ + 0, /* LOWERUTF8 */ + 0, /* LOWERL */ + 0, /* LOWERLUTF8 */ + 0, /* NLOWER */ + 0, /* NLOWERUTF8 */ + 0, /* NLOWERL */ + 0, /* NLOWERLUTF8 */ + 0, /* PRINT */ + 0, /* PRINTUTF8 */ + 0, /* PRINTL */ + 0, /* PRINTLUTF8 */ + 0, /* NPRINT */ + 0, /* NPRINTUTF8 */ + 0, /* NPRINTL */ + 0, /* NPRINTLUTF8 */ + 0, /* PUNCT */ + 0, /* PUNCTUTF8 */ + 0, /* PUNCTL */ + 0, /* PUNCTLUTF8 */ + 0, /* NPUNCT */ + 0, /* NPUNCTUTF8 */ + 0, /* NPUNCTL */ + 0, /* NPUNCTLUTF8 */ + 0, /* UPPER */ + 0, /* UPPERUTF8 */ + 0, /* UPPERL */ + 0, /* UPPERLUTF8 */ + 0, /* NUPPER */ + 0, /* NUPPERUTF8 */ + 0, /* NUPPERL */ + 0, /* NUPPERLUTF8 */ + 0, /* XDIGIT */ + 0, /* NXDIGIT */ 0, /* CLUMP */ 0, /* BRANCH */ 0, /* BACK */ @@ -367,43 +655,115 @@ const static char * const reg_name[] = { "NSPACELUTF8", /* 0x27 */ "DIGIT", /* 0x28 */ "DIGITUTF8", /* 0x29 */ - "NDIGIT", /* 0x2a */ - "NDIGITUTF8", /* 0x2b */ - "CLUMP", /* 0x2c */ - "BRANCH", /* 0x2d */ - "BACK", /* 0x2e */ - "EXACT", /* 0x2f */ - "EXACTF", /* 0x30 */ - "EXACTFL", /* 0x31 */ - "NOTHING", /* 0x32 */ - "TAIL", /* 0x33 */ - "STAR", /* 0x34 */ - "PLUS", /* 0x35 */ - "CURLY", /* 0x36 */ - "CURLYN", /* 0x37 */ - "CURLYM", /* 0x38 */ - "CURLYX", /* 0x39 */ - "WHILEM", /* 0x3a */ - "OPEN", /* 0x3b */ - "CLOSE", /* 0x3c */ - "REF", /* 0x3d */ - "REFF", /* 0x3e */ - "REFFL", /* 0x3f */ - "IFMATCH", /* 0x40 */ - "UNLESSM", /* 0x41 */ - 
"SUSPEND", /* 0x42 */ - "IFTHEN", /* 0x43 */ - "GROUPP", /* 0x44 */ - "LONGJMP", /* 0x45 */ - "BRANCHJ", /* 0x46 */ - "EVAL", /* 0x47 */ - "MINMOD", /* 0x48 */ - "LOGICAL", /* 0x49 */ - "RENUM", /* 0x4a */ - "OPTIMIZED", /* 0x4b */ + "DIGITL", /* 0x2a */ + "DIGITLUTF8", /* 0x2b */ + "NDIGIT", /* 0x2c */ + "NDIGITUTF8", /* 0x2d */ + "NDIGITL", /* 0x2e */ + "NDIGITLUTF8", /* 0x2f */ + "ALNUMC", /* 0x30 */ + "ALNUMCUTF8", /* 0x31 */ + "ALNUMCL", /* 0x32 */ + "ALNUMCLUTF8", /* 0x33 */ + "NALNUMC", /* 0x34 */ + "NALNUMCUTF8", /* 0x35 */ + "NALNUMCL", /* 0x36 */ + "NALNUMCLUTF8", /* 0x37 */ + "ALPHA", /* 0x38 */ + "ALPHAUTF8", /* 0x39 */ + "ALPHAL", /* 0x3a */ + "ALPHALUTF8", /* 0x3b */ + "NALPHA", /* 0x3c */ + "NALPHAUTF8", /* 0x3d */ + "NALPHAL", /* 0x3e */ + "NALPHALUTF8", /* 0x3f */ + "ASCII", /* 0x40 */ + "NASCII", /* 0x41 */ + "CNTRL", /* 0x42 */ + "CNTRLUTF8", /* 0x43 */ + "CNTRLL", /* 0x44 */ + "CNTRLLUTF8", /* 0x45 */ + "NCNTRL", /* 0x46 */ + "NCNTRLUTF8", /* 0x47 */ + "NCNTRLL", /* 0x48 */ + "NCNTRLLUTF8", /* 0x49 */ + "GRAPH", /* 0x4a */ + "GRAPHUTF8", /* 0x4b */ + "GRAPHL", /* 0x4c */ + "GRAPHLUTF8", /* 0x4d */ + "NGRAPH", /* 0x4e */ + "NGRAPHUTF8", /* 0x4f */ + "NGRAPHL", /* 0x50 */ + "NGRAPHLUTF8", /* 0x51 */ + "LOWER", /* 0x52 */ + "LOWERUTF8", /* 0x53 */ + "LOWERL", /* 0x54 */ + "LOWERLUTF8", /* 0x55 */ + "NLOWER", /* 0x56 */ + "NLOWERUTF8", /* 0x57 */ + "NLOWERL", /* 0x58 */ + "NLOWERLUTF8", /* 0x59 */ + "PRINT", /* 0x5a */ + "PRINTUTF8", /* 0x5b */ + "PRINTL", /* 0x5c */ + "PRINTLUTF8", /* 0x5d */ + "NPRINT", /* 0x5e */ + "NPRINTUTF8", /* 0x5f */ + "NPRINTL", /* 0x60 */ + "NPRINTLUTF8", /* 0x61 */ + "PUNCT", /* 0x62 */ + "PUNCTUTF8", /* 0x63 */ + "PUNCTL", /* 0x64 */ + "PUNCTLUTF8", /* 0x65 */ + "NPUNCT", /* 0x66 */ + "NPUNCTUTF8", /* 0x67 */ + "NPUNCTL", /* 0x68 */ + "NPUNCTLUTF8", /* 0x69 */ + "UPPER", /* 0x6a */ + "UPPERUTF8", /* 0x6b */ + "UPPERL", /* 0x6c */ + "UPPERLUTF8", /* 0x6d */ + "NUPPER", /* 0x6e */ + "NUPPERUTF8", /* 0x6f */ + "NUPPERL", 
/* 0x70 */ + "NUPPERLUTF8", /* 0x71 */ + "XDIGIT", /* 0x72 */ + "NXDIGIT", /* 0x73 */ + "CLUMP", /* 0x74 */ + "BRANCH", /* 0x75 */ + "BACK", /* 0x76 */ + "EXACT", /* 0x77 */ + "EXACTF", /* 0x78 */ + "EXACTFL", /* 0x79 */ + "NOTHING", /* 0x7a */ + "TAIL", /* 0x7b */ + "STAR", /* 0x7c */ + "PLUS", /* 0x7d */ + "CURLY", /* 0x7e */ + "CURLYN", /* 0x7f */ + "CURLYM", /* 0x80 */ + "CURLYX", /* 0x81 */ + "WHILEM", /* 0x82 */ + "OPEN", /* 0x83 */ + "CLOSE", /* 0x84 */ + "REF", /* 0x85 */ + "REFF", /* 0x86 */ + "REFFL", /* 0x87 */ + "IFMATCH", /* 0x88 */ + "UNLESSM", /* 0x89 */ + "SUSPEND", /* 0x8a */ + "IFTHEN", /* 0x8b */ + "GROUPP", /* 0x8c */ + "LONGJMP", /* 0x8d */ + "BRANCHJ", /* 0x8e */ + "EVAL", /* 0x8f */ + "MINMOD", /* 0x90 */ + "LOGICAL", /* 0x91 */ + "RENUM", /* 0x92 */ + "OPTIMIZED", /* 0x93 */ }; -const static int reg_num = 76; +const static int reg_num = 148; #endif /* DEBUGGING */ #endif /* REG_COMP_C */ diff --git a/t/op/pat.t b/t/op/pat.t index a086c12..6312c75 100755 --- a/t/op/pat.t +++ b/t/op/pat.t @@ -282,14 +282,7 @@ eval qq("${context}y" =~ /(?<=$context)y/); print "not " if $@ !~ m%^\Q/(?<=\Ex+/: lookbehind longer than 255 not%; print "ok 71\n"; -# This one will fail when POSIX character classes do get implemented -{ - my $w; - local $^W = 1; - local $SIG{__WARN__} = sub{$w = shift}; - eval q('a' =~ /[[:alpha:]]/); - print "not " if $w !~ /^\QCharacter class syntax [: :] is reserved/; -} +# removed test print "ok 72\n"; # Long Monsters diff --git a/t/op/re_tests b/t/op/re_tests index 466fc85..cbcb725 100644 --- a/t/op/re_tests +++ b/t/op/re_tests @@ -474,9 +474,37 @@ $(?<=^(a)) a y $1 a ([[=]+) a=[b]= y $1 =[ ([[.]+) a.[b]. 
y $1 .[ [a[:xyz: - c - /[a[:xyz:/: unmatched [] in regexp -[a[:xyz:] - c - /[a[:xyz:]/: unmatched [] in regexp +[a[:xyz:] - c - Character class [:xyz:] unknown [a[:]b[:c] abc y $& abc -([a[:xyz:]b]+) pbaq y $1 ba +([a[:xyz:]b]+) pbaq c - Character class [:xyz:] unknown +[a[:]b[:c] abc y $& abc +([[:alpha:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd +([[:alnum:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd01Xy +([[:ascii:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd01Xy__-- ${nulnul} +([[:cntrl:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ${nulnul} +([[:digit:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 01 +([[:graph:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd01Xy__-- +([[:lower:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 cd +([[:print:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd01Xy__-- +([[:punct:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 __-- +([[:space:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 +([[:word:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd01Xy__ +([[:upper:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 AB +([[:xdigit:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd01 +([[:^alpha:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 01 +([[:^alnum:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 __-- ${nulnul}${ffff} +([[:^ascii:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ${ffff} +([[:^cntrl:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd01Xy__-- +([[:^digit:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd +([[:^lower:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 AB +([[:^print:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ${nulnul}${ffff} +([[:^punct:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd01Xy +([[:^space:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd01Xy__-- +([[:^word:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 -- ${nulnul}${ffff} +([[:^upper:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 cd01 +([[:^xdigit:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 Xy__-- ${nulnul}${ffff} +[[:foo:]] - c - Character class [:foo:] unknown +[[:^foo:]] - c - Character class [:^foo:] unknown ((?>a+)b) aaab y $1 aaab (?>(a+))b aaab y $1 aaa 
((?>[^()]+)|\([^()]*\))+ ((abc(ade)ufh()()x y $& abc(ade)ufh()()x diff --git a/t/op/regexp.t b/t/op/regexp.t index 66b2d1c..4ffe136 100755 --- a/t/op/regexp.t +++ b/t/op/regexp.t @@ -47,6 +47,8 @@ seek(TESTS,0,0); $. = 0; $bang = sprintf "\\%03o", ord "!"; # \41 would not be portable. +$ffff = chr(0xff) x 2; +$nulnul = "\0" x 2; $| = 1; print "1..$numtests\n# $iters iterations\n"; @@ -59,12 +61,16 @@ while () { infty_subst(\$pat); infty_subst(\$expect); $pat = "'$pat'" unless $pat =~ /^[:']/; - $pat =~ s/\\n/\n/g; $pat =~ s/(\$\{\w+\})/$1/eeg; + $pat =~ s/\\n/\n/g; + $subject =~ s/(\$\{\w+\})/$1/eeg; $subject =~ s/\\n/\n/g; + $expect =~ s/(\$\{\w+\})/$1/eeg; $expect =~ s/\\n/\n/g; $expect = $repl = '-' if $skip_amp and $input =~ /\$[&\`\']/; $skip = ($skip_amp ? ($result =~ s/B//i) : ($result =~ s/B//)); + # Certain tests don't work with utf8 (the re_test should be in UTF8) + $skip = 1 if ($^H &= ~0x00000008) && $pat =~ /\[:\^(alnum|print|word):\]/; $result =~ s/B//i unless $skip; for $study ('', 'study \$subject') { $c = $iters; @@ -75,7 +81,7 @@ while () { last; # no need to study a syntax error } elsif ( $skip ) { - print "ok $. # Skipped: not fixed yet\n"; next TEST; + print "ok $. # skipped\n"; next TEST; } elsif ($@) { print "not ok $. 
$input => error `$err'\n"; next TEST; diff --git a/t/pragma/utf8.t b/t/pragma/utf8.t index e5b3bb5..5e467ae 100755 --- a/t/pragma/utf8.t +++ b/t/pragma/utf8.t @@ -6,7 +6,7 @@ BEGIN { $ENV{PERL5LIB} = '../lib'; } -print "1..3\n"; +print "1..9\n"; my $test = 1; @@ -34,4 +34,37 @@ sub ok { s/([$rx])/"&#".ord($1).";"/eg; ok $_, '>☺<'; $test++; + + $_ = "alpha,numeric"; + m/([[:alpha:]]+)/; + ok $1, 'alpha'; + $test++; + + $_ = "alphaNUMERICstring"; + m/([[:^lower:]]+)/; + ok $1, 'NUMERIC'; + $test++; + + $_ = "alphaNUMERICstring"; + m/(\p{Ll}+)/; + ok $1, 'alpha'; + $test++; + + $_ = "alphaNUMERICstring"; + m/(\p{Lu}+)/; + ok $1, 'NUMERIC'; + $test++; + + $_ = "alpha,numeric"; + m/([\p{IsAlpha}]+)/; + ok $1, 'alpha'; + $test++; + + $_ = "alphaNUMERICstring"; + m/([^\p{IsLower}]+)/; + ok $1, 'NUMERIC'; + $test++; + } + + diff --git a/t/pragma/warn/regcomp b/t/pragma/warn/regcomp index 52a163a..0f48c67 100644 --- a/t/pragma/warn/regcomp +++ b/t/pragma/warn/regcomp @@ -8,9 +8,6 @@ /(?=a)?/ - Character class syntax [: :] is reserved for future extensions - /[a[:xyz:]b]/ - Character class syntax [. .] is reserved for future extensions Character class syntax [= =] is reserved for future extensions @@ -32,22 +29,21 @@ Strange *+?{} on zero-length expression at - line 4. # regcomp.c use warning 'unsafe' ; $_ = "" ; -/[a[:xyz:]b]/; /[a[.xyz.]b]/; /[a[=xyz=]b]/; EXPECT -Character class syntax [: :] is reserved for future extensions at - line 4. -Character class syntax [. .] is reserved for future extensions at - line 5. -Character class syntax [= =] is reserved for future extensions at - line 6. +Character class syntax [. .] is reserved for future extensions at - line 4. +Character class syntax [= =] is reserved for future extensions at - line 5. ######## # regcomp.c -use warning 'unsafe' ; -# use utf8 ; # Note this line should be uncommented when utf8 gets fixed. 
+use warning 'unsafe' ; $_ = "" ; -/[a[:xyz:]b]/; -/[a[.xyz.]b]/; -/[a[=xyz=]b]/; +/[:foo:]/; +/[.bar.]/; +/[=zog=]/; EXPECT -Character class syntax [: :] is reserved for future extensions at - line 5. -Character class syntax [. .] is reserved for future extensions at - line 6. -Character class syntax [= =] is reserved for future extensions at - line 7. +Character class syntax [: :] belongs inside character classes at - line 4. +Character class syntax [. .] belongs inside character classes at - line 5. +Character class syntax [. .] is reserved for future extensions at - line 5. +Character class syntax [= =] belongs inside character classes at - line 6. +Character class syntax [= =] is reserved for future extensions at - line 6. diff --git a/utf8.c b/utf8.c index 8c7aee2..0e52f21 100644 --- a/utf8.c +++ b/utf8.c @@ -255,6 +255,14 @@ Perl_is_uni_alnum(pTHX_ U32 c) } bool +Perl_is_uni_alnumc(pTHX_ U32 c) +{ + U8 tmpbuf[10]; + uv_to_utf8(tmpbuf, (UV)c); + return is_utf8_alnumc(tmpbuf); +} + +bool Perl_is_uni_idfirst(pTHX_ U32 c) { U8 tmpbuf[10]; @@ -303,6 +311,22 @@ Perl_is_uni_lower(pTHX_ U32 c) } bool +Perl_is_uni_cntrl(pTHX_ U32 c) +{ + U8 tmpbuf[10]; + uv_to_utf8(tmpbuf, (UV)c); + return is_utf8_cntrl(tmpbuf); +} + +bool +Perl_is_uni_graph(pTHX_ U32 c) +{ + U8 tmpbuf[10]; + uv_to_utf8(tmpbuf, (UV)c); + return is_utf8_graph(tmpbuf); +} + +bool Perl_is_uni_print(pTHX_ U32 c) { U8 tmpbuf[10]; @@ -310,6 +334,14 @@ Perl_is_uni_print(pTHX_ U32 c) return is_utf8_print(tmpbuf); } +bool +Perl_is_uni_punct(pTHX_ U32 c) +{ + U8 tmpbuf[10]; + uv_to_utf8(tmpbuf, (UV)c); + return is_utf8_punct(tmpbuf); +} + U32 Perl_to_uni_upper(pTHX_ U32 c) { @@ -343,6 +375,12 @@ Perl_is_uni_alnum_lc(pTHX_ U32 c) } bool +Perl_is_uni_alnumc_lc(pTHX_ U32 c) +{ + return is_uni_alnumc(c); /* XXX no locale support yet */ +} + +bool Perl_is_uni_idfirst_lc(pTHX_ U32 c) { return is_uni_idfirst(c); /* XXX no locale support yet */ @@ -379,11 +417,29 @@ Perl_is_uni_lower_lc(pTHX_ U32 c) } bool
+Perl_is_uni_cntrl_lc(pTHX_ U32 c) +{ + return is_uni_cntrl(c); /* XXX no locale support yet */ +} + +bool +Perl_is_uni_graph_lc(pTHX_ U32 c) +{ + return is_uni_graph(c); /* XXX no locale support yet */ +} + +bool Perl_is_uni_print_lc(pTHX_ U32 c) { return is_uni_print(c); /* XXX no locale support yet */ } +bool +Perl_is_uni_punct_lc(pTHX_ U32 c) +{ + return is_uni_punct(c); /* XXX no locale support yet */ +} + U32 Perl_to_uni_upper_lc(pTHX_ U32 c) { @@ -402,7 +458,6 @@ Perl_to_uni_lower_lc(pTHX_ U32 c) return to_uni_lower(c); /* XXX no locale support yet */ } - bool Perl_is_utf8_alnum(pTHX_ U8 *p) { @@ -419,6 +474,21 @@ Perl_is_utf8_alnum(pTHX_ U8 *p) } bool +Perl_is_utf8_alnumc(pTHX_ U8 *p) +{ + if (!PL_utf8_alnum) + PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0); + return swash_fetch(PL_utf8_alnum, p); +/* return is_utf8_alpha(p) || is_utf8_digit(p); */ +#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */ + if (!PL_utf8_alnum) + PL_utf8_alnum = swash_init("utf8", "", + sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0); + return swash_fetch(PL_utf8_alnum, p); +#endif +} + +bool Perl_is_utf8_idfirst(pTHX_ U8 *p) { return *p == '_' || is_utf8_alpha(p); @@ -433,6 +503,14 @@ Perl_is_utf8_alpha(pTHX_ U8 *p) } bool +Perl_is_utf8_ascii(pTHX_ U8 *p) +{ + if (!PL_utf8_ascii) + PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0); + return swash_fetch(PL_utf8_ascii, p); +} + +bool Perl_is_utf8_space(pTHX_ U8 *p) { if (!PL_utf8_space) @@ -465,6 +543,22 @@ Perl_is_utf8_lower(pTHX_ U8 *p) } bool +Perl_is_utf8_cntrl(pTHX_ U8 *p) +{ + if (!PL_utf8_cntrl) + PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0); + return swash_fetch(PL_utf8_cntrl, p); +} + +bool +Perl_is_utf8_graph(pTHX_ U8 *p) +{ + if (!PL_utf8_graph) + PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0); + return swash_fetch(PL_utf8_graph, p); +} + +bool Perl_is_utf8_print(pTHX_ U8 *p) { if (!PL_utf8_print) @@ 
-473,6 +567,22 @@ Perl_is_utf8_print(pTHX_ U8 *p) } bool +Perl_is_utf8_punct(pTHX_ U8 *p) +{ + if (!PL_utf8_punct) + PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0); + return swash_fetch(PL_utf8_punct, p); +} + +bool +Perl_is_utf8_xdigit(pTHX_ U8 *p) +{ + if (!PL_utf8_xdigit) + PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0); + return swash_fetch(PL_utf8_xdigit, p); +} + +bool Perl_is_utf8_mark(pTHX_ U8 *p) { if (!PL_utf8_mark)