add new files to MANIFEST; add missing prototypes to proto.h;
Gurusamy Sarathy [Sun, 26 Jul 1998 05:07:05 +0000 (05:07 +0000)]
s/PL_utf8skip/utf8skip/ for now, or we end up with Perl_PL_;
add typecasts to silence warnings; tweaks for win32 builds

p4raw-id: //depot/perl@1663

MANIFEST
embed.h
global.sym
proto.h
regexec.c
toke.c
utf8.h
win32/Makefile
win32/makefile.mk

diff --git a/MANIFEST b/MANIFEST
index 056e369..192caef 100644 (file)
--- a/MANIFEST
+++ b/MANIFEST
@@ -589,6 +589,165 @@ lib/syslog.pl             Perl library supporting syslogging
 lib/tainted.pl         Old code for tainting
 lib/termcap.pl         Perl library supporting termcap usage
 lib/timelocal.pl       Perl library supporting inverse of localtime, gmtime
+lib/unicode/ArabLink.pl                                Unicode character database
+lib/unicode/ArabLnkGrp.pl                      Unicode character database
+lib/unicode/Bidirectional.pl                   Unicode character database
+lib/unicode/Block.pl                           Unicode character database
+lib/unicode/Category.pl                                Unicode character database
+lib/unicode/CombiningClass.pl                  Unicode character database
+lib/unicode/Decomposition.pl                   Unicode character database
+lib/unicode/In/AlphabeticPresentationForms.pl  Unicode character database
+lib/unicode/In/Arabic.pl                       Unicode character database
+lib/unicode/In/ArabicPresentationForms-A.pl    Unicode character database
+lib/unicode/In/ArabicPresentationForms-B.pl    Unicode character database
+lib/unicode/In/Armenian.pl                     Unicode character database
+lib/unicode/In/Arrows.pl                       Unicode character database
+lib/unicode/In/BasicLatin.pl                   Unicode character database
+lib/unicode/In/Bengali.pl                      Unicode character database
+lib/unicode/In/BlockElements.pl                        Unicode character database
+lib/unicode/In/Bopomofo.pl                     Unicode character database
+lib/unicode/In/BoxDrawing.pl                   Unicode character database
+lib/unicode/In/CJKCompatibility.pl             Unicode character database
+lib/unicode/In/CJKCompatibilityForms.pl                Unicode character database
+lib/unicode/In/CJKCompatibilityIdeographs.pl   Unicode character database
+lib/unicode/In/CJKSymbolsandPunctuation.pl     Unicode character database
+lib/unicode/In/CJKUnifiedIdeographs.pl         Unicode character database
+lib/unicode/In/CombiningDiacriticalMarks.pl    Unicode character database
+lib/unicode/In/CombiningHalfMarks.pl           Unicode character database
+lib/unicode/In/CombiningMarksforSymbols.pl     Unicode character database
+lib/unicode/In/ControlPictures.pl              Unicode character database
+lib/unicode/In/CurrencySymbols.pl              Unicode character database
+lib/unicode/In/Cyrillic.pl                     Unicode character database
+lib/unicode/In/Devanagari.pl                   Unicode character database
+lib/unicode/In/Dingbats.pl                     Unicode character database
+lib/unicode/In/EnclosedAlphanumerics.pl                Unicode character database
+lib/unicode/In/EnclosedCJKLettersandMonths.pl  Unicode character database
+lib/unicode/In/GeneralPunctuation.pl           Unicode character database
+lib/unicode/In/GeometricShapes.pl              Unicode character database
+lib/unicode/In/Georgian.pl                     Unicode character database
+lib/unicode/In/Greek.pl                                Unicode character database
+lib/unicode/In/GreekExtended.pl                        Unicode character database
+lib/unicode/In/Gujarati.pl                     Unicode character database
+lib/unicode/In/Gurmukhi.pl                     Unicode character database
+lib/unicode/In/HalfwidthandFullwidthForms.pl   Unicode character database
+lib/unicode/In/HangulCompatibilityJamo.pl      Unicode character database
+lib/unicode/In/HangulJamo.pl                   Unicode character database
+lib/unicode/In/HangulSyllables.pl              Unicode character database
+lib/unicode/In/Hebrew.pl                       Unicode character database
+lib/unicode/In/HighPrivateUseSurrogates.pl     Unicode character database
+lib/unicode/In/HighSurrogates.pl               Unicode character database
+lib/unicode/In/Hiragana.pl                     Unicode character database
+lib/unicode/In/IPAExtensions.pl                        Unicode character database
+lib/unicode/In/Kanbun.pl                       Unicode character database
+lib/unicode/In/Kannada.pl                      Unicode character database
+lib/unicode/In/Katakana.pl                     Unicode character database
+lib/unicode/In/Lao.pl                          Unicode character database
+lib/unicode/In/Latin-1Supplement.pl            Unicode character database
+lib/unicode/In/LatinExtended-A.pl              Unicode character database
+lib/unicode/In/LatinExtended-B.pl              Unicode character database
+lib/unicode/In/LatinExtendedAdditional.pl      Unicode character database
+lib/unicode/In/LetterlikeSymbols.pl            Unicode character database
+lib/unicode/In/LowSurrogates.pl                        Unicode character database
+lib/unicode/In/Malayalam.pl                    Unicode character database
+lib/unicode/In/MathematicalOperators.pl                Unicode character database
+lib/unicode/In/MiscellaneousSymbols.pl         Unicode character database
+lib/unicode/In/MiscellaneousTechnical.pl       Unicode character database
+lib/unicode/In/NumberForms.pl                  Unicode character database
+lib/unicode/In/OpticalCharacterRecognition.pl  Unicode character database
+lib/unicode/In/Oriya.pl                                Unicode character database
+lib/unicode/In/PrivateUse.pl                   Unicode character database
+lib/unicode/In/SmallFormVariants.pl            Unicode character database
+lib/unicode/In/SpacingModifierLetters.pl       Unicode character database
+lib/unicode/In/Specials.pl                     Unicode character database
+lib/unicode/In/SuperscriptsandSubscripts.pl    Unicode character database
+lib/unicode/In/Tamil.pl                                Unicode character database
+lib/unicode/In/Telugu.pl                       Unicode character database
+lib/unicode/In/Thai.pl                         Unicode character database
+lib/unicode/In/Tibetan.pl                      Unicode character database
+lib/unicode/Is/Alnum.pl                                Unicode character database
+lib/unicode/Is/Alpha.pl                                Unicode character database
+lib/unicode/Is/BidiAN.pl                       Unicode character database
+lib/unicode/Is/BidiB.pl                                Unicode character database
+lib/unicode/Is/BidiCS.pl                       Unicode character database
+lib/unicode/Is/BidiEN.pl                       Unicode character database
+lib/unicode/Is/BidiES.pl                       Unicode character database
+lib/unicode/Is/BidiET.pl                       Unicode character database
+lib/unicode/Is/BidiL.pl                                Unicode character database
+lib/unicode/Is/BidiON.pl                       Unicode character database
+lib/unicode/Is/BidiR.pl                                Unicode character database
+lib/unicode/Is/BidiS.pl                                Unicode character database
+lib/unicode/Is/BidiWS.pl                       Unicode character database
+lib/unicode/Is/C.pl                            Unicode character database
+lib/unicode/Is/Cc.pl                           Unicode character database
+lib/unicode/Is/Cn.pl                           Unicode character database
+lib/unicode/Is/Co.pl                           Unicode character database
+lib/unicode/Is/DCcircle.pl                     Unicode character database
+lib/unicode/Is/DCcompat.pl                     Unicode character database
+lib/unicode/Is/DCfinal.pl                      Unicode character database
+lib/unicode/Is/DCfont.pl                       Unicode character database
+lib/unicode/Is/DCinital.pl                     Unicode character database
+lib/unicode/Is/DCinitial.pl                    Unicode character database
+lib/unicode/Is/DCisolated.pl                   Unicode character database
+lib/unicode/Is/DCnarrow.pl                     Unicode character database
+lib/unicode/Is/DCnoBreak.pl                    Unicode character database
+lib/unicode/Is/DCsmall.pl                      Unicode character database
+lib/unicode/Is/DCsquare.pl                     Unicode character database
+lib/unicode/Is/DCsub.pl                                Unicode character database
+lib/unicode/Is/DCsuper.pl                      Unicode character database
+lib/unicode/Is/DCvertical.pl                   Unicode character database
+lib/unicode/Is/DCwide.pl                       Unicode character database
+lib/unicode/Is/DecoCanon.pl                    Unicode character database
+lib/unicode/Is/DecoCompat.pl                   Unicode character database
+lib/unicode/Is/Digit.pl                                Unicode character database
+lib/unicode/Is/L.pl                            Unicode character database
+lib/unicode/Is/Ll.pl                           Unicode character database
+lib/unicode/Is/Lm.pl                           Unicode character database
+lib/unicode/Is/Lo.pl                           Unicode character database
+lib/unicode/Is/Lower.pl                                Unicode character database
+lib/unicode/Is/Lt.pl                           Unicode character database
+lib/unicode/Is/Lu.pl                           Unicode character database
+lib/unicode/Is/M.pl                            Unicode character database
+lib/unicode/Is/Mc.pl                           Unicode character database
+lib/unicode/Is/Mirrored.pl                     Unicode character database
+lib/unicode/Is/Mn.pl                           Unicode character database
+lib/unicode/Is/N.pl                            Unicode character database
+lib/unicode/Is/Nd.pl                           Unicode character database
+lib/unicode/Is/No.pl                           Unicode character database
+lib/unicode/Is/P.pl                            Unicode character database
+lib/unicode/Is/Pd.pl                           Unicode character database
+lib/unicode/Is/Pe.pl                           Unicode character database
+lib/unicode/Is/Po.pl                           Unicode character database
+lib/unicode/Is/Print.pl                                Unicode character database
+lib/unicode/Is/Ps.pl                           Unicode character database
+lib/unicode/Is/S.pl                            Unicode character database
+lib/unicode/Is/Sc.pl                           Unicode character database
+lib/unicode/Is/Sm.pl                           Unicode character database
+lib/unicode/Is/So.pl                           Unicode character database
+lib/unicode/Is/Space.pl                                Unicode character database
+lib/unicode/Is/Upper.pl                                Unicode character database
+lib/unicode/Is/Z.pl                            Unicode character database
+lib/unicode/Is/Zl.pl                           Unicode character database
+lib/unicode/Is/Zp.pl                           Unicode character database
+lib/unicode/Is/Zs.pl                           Unicode character database
+lib/unicode/JamoShort.pl                       Unicode character database
+lib/unicode/Makefile                           Unicode character database
+lib/unicode/Name.pl                            Unicode character database
+lib/unicode/Number.pl                          Unicode character database
+lib/unicode/To/Digit.pl                                Unicode character database
+lib/unicode/To/Lower.pl                                Unicode character database
+lib/unicode/To/Title.pl                                Unicode character database
+lib/unicode/To/Upper.pl                                Unicode character database
+lib/unicode/UnicodeData-Latest.txt             Unicode character database
+lib/unicode/arabshp.txt                                Unicode character database
+lib/unicode/blocks.txt                         Unicode character database
+lib/unicode/index2.txt                         Unicode character database
+lib/unicode/jamo2.txt                          Unicode character database
+lib/unicode/mktables.PL                                Unicode character database generator
+lib/unicode/names2.txt                         Unicode character database
+lib/unicode/props2.txt                         Unicode character database
+lib/unicode/readme.txt                         Unicode character database info
+lib/utf8.pm                                    Pragma to control Unicode support
+lib/utf8_heavy.pl                              Support routines for utf8 pragma
 lib/validate.pl                Perl library supporting wholesale file mode validation
 lib/vars.pm            Declare pseudo-imported global variables
 makeaperl.SH           perl script that produces a new perl binary
@@ -769,6 +928,7 @@ sv.c                        Scalar value code
 sv.h                   Scalar value header
 t/README               Instructions for regression tests
 t/TEST                 The regression tester
+t/UTEST                        Run regression tests with -Mutf8
 t/base/cond.t          See if conditionals work
 t/base/if.t            See if if works
 t/base/lex.t           See if lexical items work
@@ -971,6 +1131,8 @@ thread.sym         Symbols for threads
 toke.c                 The tokener
 universal.c            The default UNIVERSAL package methods
 unixish.h              Defines that are assumed on Unix
+utf8.c                 Unicode routines
+utf8.h                 Unicode header
 util.c                 Utility routines
 util.h                 Dummy header
 utils/Makefile         Extract the utility scripts
diff --git a/embed.h b/embed.h
index 6026c18..ef19977 100644 (file)
--- a/embed.h
+++ b/embed.h
 #define utf8_distance          Perl_utf8_distance
 #define utf8_hop               Perl_utf8_hop
 #define utf8_to_uv             Perl_utf8_to_uv
+#define utf8skip               Perl_utf8skip
 #define utilize                        Perl_utilize
 #define uv_to_utf8             Perl_uv_to_utf8
 #define varies                 Perl_varies
diff --git a/global.sym b/global.sym
index ac13e65..ef16b8a 100644 (file)
--- a/global.sym
+++ b/global.sym
@@ -1102,6 +1102,7 @@ utf16_to_utf8_reversed
 utf8_distance
 utf8_hop
 utf8_to_uv
+utf8skip
 utilize
 uv_to_utf8
 wait4pid
diff --git a/proto.h b/proto.h
index acd88d0..7ee3cb4 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -194,6 +194,28 @@ VIRTUAL U32        intro_my _((void));
 VIRTUAL char*  instr _((char* big, char* little));
 VIRTUAL bool   io_close _((IO* io));
 VIRTUAL OP*    invert _((OP* cmd));
+VIRTUAL bool   is_uni_alnum _((U32 c));
+VIRTUAL bool   is_uni_idfirst _((U32 c));
+VIRTUAL bool   is_uni_alpha _((U32 c));
+VIRTUAL bool   is_uni_space _((U32 c));
+VIRTUAL bool   is_uni_digit _((U32 c));
+VIRTUAL bool   is_uni_upper _((U32 c));
+VIRTUAL bool   is_uni_lower _((U32 c));
+VIRTUAL bool   is_uni_print _((U32 c));
+VIRTUAL U32    to_uni_upper _((U32 c));
+VIRTUAL U32    to_uni_title _((U32 c));
+VIRTUAL U32    to_uni_lower _((U32 c));
+VIRTUAL bool   is_uni_alnum_lc _((U32 c));
+VIRTUAL bool   is_uni_idfirst_lc _((U32 c));
+VIRTUAL bool   is_uni_alpha_lc _((U32 c));
+VIRTUAL bool   is_uni_space_lc _((U32 c));
+VIRTUAL bool   is_uni_digit_lc _((U32 c));
+VIRTUAL bool   is_uni_upper_lc _((U32 c));
+VIRTUAL bool   is_uni_lower_lc _((U32 c));
+VIRTUAL bool   is_uni_print_lc _((U32 c));
+VIRTUAL U32    to_uni_upper_lc _((U32 c));
+VIRTUAL U32    to_uni_title_lc _((U32 c));
+VIRTUAL U32    to_uni_lower_lc _((U32 c));
 VIRTUAL bool   is_utf8_alnum _((unsigned char *p));
 VIRTUAL bool   is_utf8_idfirst _((unsigned char *p));
 VIRTUAL bool   is_utf8_alpha _((unsigned char *p));
index fe9f833..400843b 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -420,7 +420,7 @@ regexec_flags(register regexp *prog, char *stringarg, register char *strend,
        I32 back_min = 
            prog->anchored_substr ? prog->anchored_offset : prog->float_min_offset;
        I32 delta = back_max - back_min;
-       char *last = HOP(strend, -(CHR_SVLEN(must) + back_min)); /* Cannot start after this */
+       char *last = HOP(strend, 0-(CHR_SVLEN(must) + back_min)); /* Cannot start after this */
        char *last1;            /* Last position checked before */
 
        if (s > PL_bostr)
@@ -1130,7 +1130,7 @@ regmatch(regnode *prog)
            break;
        case SANYUTF8:
            if (nextchr & 0x80) {
-               locinput += PL_utf8skip[nextchr];
+               locinput += utf8skip[nextchr];
                if (locinput > PL_regeol)
                    sayNO;
                nextchr = UCHARAT(locinput);
@@ -1147,7 +1147,7 @@ regmatch(regnode *prog)
            break;
        case ANYUTF8:
            if (nextchr & 0x80) {
-               locinput += PL_utf8skip[nextchr];
+               locinput += utf8skip[nextchr];
                if (locinput > PL_regeol)
                    sayNO;
                nextchr = UCHARAT(locinput);
@@ -1219,7 +1219,7 @@ regmatch(regnode *prog)
                sayNO;
            if (locinput >= PL_regeol)
                sayNO;
-           locinput += PL_utf8skip[nextchr];
+           locinput += utf8skip[nextchr];
            nextchr = UCHARAT(locinput);
            break;
        case ANYOF:
@@ -1253,7 +1253,7 @@ regmatch(regnode *prog)
                if (!(OP(scan) == ALNUMUTF8
                      ? swash_fetch(PL_utf8_alnum, locinput) : isALNUM_LC_utf8(locinput)))
                    sayNO;
-               locinput += PL_utf8skip[nextchr];
+               locinput += utf8skip[nextchr];
                nextchr = UCHARAT(locinput);
                break;
            }
@@ -1283,7 +1283,7 @@ regmatch(regnode *prog)
                if (OP(scan) == NALNUMUTF8
                      ? swash_fetch(PL_utf8_alnum, locinput) : isALNUM_LC_utf8(locinput))
                    sayNO;
-               locinput += PL_utf8skip[nextchr];
+               locinput += utf8skip[nextchr];
                nextchr = UCHARAT(locinput);
                break;
            }
@@ -1351,7 +1351,7 @@ regmatch(regnode *prog)
                if (!(OP(scan) == SPACEUTF8
                      ? swash_fetch(PL_utf8_space,locinput) : isSPACE_LC_utf8(locinput)))
                    sayNO;
-               locinput += PL_utf8skip[nextchr];
+               locinput += utf8skip[nextchr];
                nextchr = UCHARAT(locinput);
                break;
            }
@@ -1381,7 +1381,7 @@ regmatch(regnode *prog)
                if (OP(scan) == NSPACEUTF8
                      ? swash_fetch(PL_utf8_space,locinput) : isSPACE_LC_utf8(locinput))
                    sayNO;
-               locinput += PL_utf8skip[nextchr];
+               locinput += utf8skip[nextchr];
                nextchr = UCHARAT(locinput);
                break;
            }
@@ -1399,7 +1399,7 @@ regmatch(regnode *prog)
            if (nextchr & 0x80) {
                if (!(swash_fetch(PL_utf8_digit,locinput)))
                    sayNO;
-               locinput += PL_utf8skip[nextchr];
+               locinput += utf8skip[nextchr];
                nextchr = UCHARAT(locinput);
                break;
            }
@@ -1420,7 +1420,7 @@ regmatch(regnode *prog)
            if (nextchr & 0x80) {
                if (swash_fetch(PL_utf8_digit,locinput))
                    sayNO;
-               locinput += PL_utf8skip[nextchr];
+               locinput += utf8skip[nextchr];
                nextchr = UCHARAT(locinput);
                break;
            }
@@ -1431,7 +1431,7 @@ regmatch(regnode *prog)
        case CLUMP:
            if (locinput >= PL_regeol || swash_fetch(PL_utf8_mark, locinput))
                sayNO;
-           locinput += PL_utf8skip[nextchr];
+           locinput += utf8skip[nextchr];
            while (locinput < PL_regeol && swash_fetch(PL_utf8_mark, locinput))
                locinput += UTF8SKIP(locinput);
            if (locinput > PL_regeol)
diff --git a/toke.c b/toke.c
index 9f96319..13cc965 100644 (file)
--- a/toke.c
+++ b/toke.c
@@ -915,7 +915,7 @@ scan_const(char *start)
            /* range begins (ignore - as first or last char) */
            else if (*s == '-' && s+1 < send  && s != start) {
                if (utf) {
-                   *d++ = 0xff;        /* use illegal utf8 byte--see pmtrans */
+                   *d++ = (char)0xff;  /* use illegal utf8 byte--see pmtrans */
                    s++;
                    continue;
                }
diff --git a/utf8.h b/utf8.h
index 6f86f72..f39e340 100644 (file)
--- a/utf8.h
+++ b/utf8.h
@@ -8,7 +8,7 @@
  */
 
 #ifdef DOINIT
-EXTCONST unsigned char PL_utf8skip[] = {
+EXTCONST unsigned char utf8skip[] = {
 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* ascii */
 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* ascii */
 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* ascii */
@@ -19,9 +19,9 @@ EXTCONST unsigned char PL_utf8skip[] = {
 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,7,8, /* cjk etc. */
 };
 #else
-EXTCONST unsigned char PL_utf8skip[];
+EXTCONST unsigned char utf8skip[];
 #endif
 
 #define IN_UTF8 (curcop->op_private & HINT_UTF8)
 
-#define UTF8SKIP(s) PL_utf8skip[*(U8*)s]
+#define UTF8SKIP(s) utf8skip[*(U8*)s]
diff --git a/win32/Makefile b/win32/Makefile
index 8570f5d..addf487 100644 (file)
--- a/win32/Makefile
+++ b/win32/Makefile
@@ -354,6 +354,7 @@ MICROCORE_SRC       =               \
                ..\taint.c      \
                ..\toke.c       \
                ..\universal.c  \
+               ..\utf8.c       \
                ..\util.c
 
 !IF "$(PERL_MALLOC)" == "define"
@@ -427,6 +428,7 @@ CORE_NOCFG_H        =               \
                ..\sv.h         \
                ..\thread.h     \
                ..\unixish.h    \
+               ..\utf8.h       \
                ..\util.h       \
                ..\XSUB.h       \
                ..\EXTERN.h     \
diff --git a/win32/makefile.mk b/win32/makefile.mk
index ad24e21..50cdda9 100644 (file)
--- a/win32/makefile.mk
+++ b/win32/makefile.mk
@@ -198,7 +198,7 @@ OPTIMIZE    = -O2 $(RUNTIME)
 LINK_DBG       = 
 .ENDIF
 
-CFLAGS         = -w -d -tWM -tWD $(INCLUDES) $(DEFINES) $(LOCDEFS) \
+CFLAGS         = -K -w -d -tWM -tWD $(INCLUDES) $(DEFINES) $(LOCDEFS) \
                $(PCHFLAGS) $(OPTIMIZE)
 LINK_FLAGS     = $(LINK_DBG) -L$(CCLIBDIR) $(EXTRALIBDIRS:^"-L")
 OBJOUT_FLAG    = -o
@@ -471,6 +471,7 @@ MICROCORE_SRC       =               \
                ..\taint.c      \
                ..\toke.c       \
                ..\universal.c  \
+               ..\utf8.c       \
                ..\util.c
 
 .IF "$(PERL_MALLOC)" == "define"
@@ -544,6 +545,7 @@ CORE_NOCFG_H        =               \
                ..\sv.h         \
                ..\thread.h     \
                ..\unixish.h    \
+               ..\utf8.h       \
                ..\util.h       \
                ..\XSUB.h       \
                ..\EXTERN.h     \