From: Jarkko Hietaniemi Date: Sun, 7 Apr 2002 14:25:28 +0000 (+0000) Subject: There was no nice way of getting in UTF-8 filenames: X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=dd36d13c89140c2d9d7954b9f1de583003154c13;p=p5sagit%2Fp5-mst-13.2.git There was no nice way of getting in UTF-8 filenames: now one can use in the (new) three-arg form of readdir() and in File::Glob import a ":utf8" to transparently accept the filenames as Unicode. Note that only :utf8 is supported, not fancier stuff like :encoding(foobar) p4raw-id: //depot/perl@15776 --- diff --git a/MANIFEST b/MANIFEST index be6e311..9ffb4af 100644 --- a/MANIFEST +++ b/MANIFEST @@ -382,6 +382,7 @@ ext/File/Glob/t/basic.t See if File::Glob works ext/File/Glob/t/case.t See if File::Glob works ext/File/Glob/t/global.t See if File::Glob works ext/File/Glob/t/taint.t See if File::Glob works +ext/File/Glob/t/utf8.t See if File::Glob works ext/File/Glob/TODO File::Glob extension todo list ext/Filter/t/call.t See if Filter::Util::Call works ext/Filter/Util/Call/Call.pm Filter::Util::Call extension module diff --git a/ext/File/Glob/Glob.pm b/ext/File/Glob/Glob.pm index a704b56..4c34d38 100644 --- a/ext/File/Glob/Glob.pm +++ b/ext/File/Glob/Glob.pm @@ -31,6 +31,7 @@ use XSLoader (); GLOB_NOSPACE GLOB_QUOTE GLOB_TILDE + GLOB_UTF8 ); %EXPORT_TAGS = ( @@ -51,6 +52,7 @@ use XSLoader (); GLOB_NOSPACE GLOB_QUOTE GLOB_TILDE + GLOB_UTF8 glob bsd_glob ) ], @@ -62,10 +64,11 @@ sub import { require Exporter; my $i = 1; while ($i < @_) { - if ($_[$i] =~ /^:(case|nocase|globally)$/) { + if ($_[$i] =~ /^:(case|nocase|globally|utf8)$/) { splice(@_, $i, 1); $DEFAULT_FLAGS &= ~GLOB_NOCASE() if $1 eq 'case'; $DEFAULT_FLAGS |= GLOB_NOCASE() if $1 eq 'nocase'; + $DEFAULT_FLAGS |= GLOB_UTF8() if $1 eq 'utf8'; if ($1 eq 'globally') { local $^W; *CORE::GLOBAL::glob = \&File::Glob::csh_glob; @@ -203,15 +206,19 @@ File::Glob - Perl extension for BSD glob routine ## override the core glob (CORE::glob() does this automatically ## by default anyway, since v5.6.0) use File::Glob ':globally'; - my @sources = <*.{c,h,y}> + my @sources = <*.{c,h,y}>; ## override the core glob, forcing case sensitivity use File::Glob qw(:globally :case); - my @sources = <*.{c,h,y}> + my @sources = <*.{c,h,y}>; ## override the core glob forcing case insensitivity use File::Glob qw(:globally :nocase); - my @sources = <*.{c,h,y}> + my @sources = <*.{c,h,y}>; + + ## override the core glob forcing UTF-8 names + use File::Glob qw(:globally :utf8); + my @sources = <*.{c,h,y}>; =head1 DESCRIPTION @@ -321,6 +328,18 @@ order (case does not matter) rather than in ASCII order. =back +The following flag has been added in the Perl implementation for +Unicode compatibility: + +=over 4 + +=item C + +The filenames returned will be marked as being in UTF-8 encoding of +Unicode. Note that it is your responsibility to ascertain that the +filesystem you are globbing in returns valid UTF-8 filenames. +The encoding pragma affects this feature, see L. + =head1 DIAGNOSTICS bsd_glob() returns a list of matching paths, possibly zero length. If an diff --git a/ext/File/Glob/Glob.xs b/ext/File/Glob/Glob.xs index bc58b6a..5d95666 100644 --- a/ext/File/Glob/Glob.xs +++ b/ext/File/Glob/Glob.xs @@ -63,6 +63,8 @@ PPCODE: strlen(pglob.gl_pathv[i]))); TAINT; SvTAINT(tmp); + if (pglob.gl_flags & GLOB_UTF8) + sv_utf8_upgrade(tmp); PUSHs(tmp); } diff --git a/ext/File/Glob/Makefile.PL b/ext/File/Glob/Makefile.PL index 0d4267a..a24b663 100644 --- a/ext/File/Glob/Makefile.PL +++ b/ext/File/Glob/Makefile.PL @@ -26,7 +26,7 @@ WriteConstants( NAME => 'File::Glob', NAMES => [qw(GLOB_ABEND GLOB_ALPHASORT GLOB_ALTDIRFUNC GLOB_BRACE GLOB_ERR GLOB_LIMIT GLOB_MARK GLOB_NOCASE GLOB_NOCHECK GLOB_NOMAGIC - GLOB_NOSORT GLOB_NOSPACE GLOB_QUOTE GLOB_TILDE), + GLOB_NOSORT GLOB_NOSPACE GLOB_QUOTE GLOB_TILDE GLOB_UTF8), {name=>"GLOB_ERROR", macro=>["#ifdef GLOB_ERROR\n\tdMY_CXT;\n\n","#endif\n"]}], BREAKOUT_AT => 8, ); diff --git a/ext/File/Glob/bsd_glob.h b/ext/File/Glob/bsd_glob.h index af92c04..ef46c00 100644 --- a/ext/File/Glob/bsd_glob.h +++ b/ext/File/Glob/bsd_glob.h @@ -76,6 +76,7 @@ typedef struct { #define GLOB_ALPHASORT 0x2000 /* Alphabetic, not ASCII sort, like csh. */ #define GLOB_LIMIT 0x4000 /* Limit pattern match output to ARG_MAX (usually from limits.h). */ +#define GLOB_UTF8 0x8000 /* Return UTF-8. (Perl extension.) */ #define GLOB_NOSPACE (-1) /* Malloc call failed. */ #define GLOB_ABEND (-2) /* Unignored error. */ diff --git a/ext/File/Glob/t/utf8.t b/ext/File/Glob/t/utf8.t new file mode 100644 index 0000000..97d0b2c --- /dev/null +++ b/ext/File/Glob/t/utf8.t @@ -0,0 +1,17 @@ +use File::Glob qw(:globally :utf8); + +# Can't really depend on Tru64 UTF-8 filenames being so must just see +# that things don't crash and that *if* UTF-8 were to be received, it's +# valid. (Maybe later add checks that are run if we are on NTFS/HFS+.) +# (see also t/op/readdir.t) + +print "1..2\n"; + +my $a = <*>; + +print utf8::valid($a) ? "ok 1\n" : "not ok 1\n"; + +my @a=<*>; + +print utf8::valid($a[0]) ? "ok 2\n" : "not ok 2\n"; + diff --git a/opcode.h b/opcode.h index 8556e1e..01b89da 100644 --- a/opcode.h +++ b/opcode.h @@ -1744,7 +1744,7 @@ EXT U32 PL_opargs[] = { 0x0001368c, /* readlink */ 0x0012291c, /* mkdir */ 0x0001379c, /* rmdir */ - 0x0002c814, /* open_dir */ + 0x0122c814, /* open_dir */ 0x0000d600, /* readdir */ 0x0000d60c, /* telldir */ 0x0002c804, /* seekdir */ diff --git a/opcode.pl b/opcode.pl index 61cd0e8..fc90005 100755 --- a/opcode.pl +++ b/opcode.pl @@ -814,7 +814,7 @@ rmdir rmdir ck_fun isTu% S? # Directory calls. -open_dir opendir ck_fun is@ F S +open_dir opendir ck_fun is@ F S S? readdir readdir ck_fun % F telldir telldir ck_fun st% F seekdir seekdir ck_fun s@ F S diff --git a/pod/perldelta.pod b/pod/perldelta.pod index ab644bf..37c2a31 100644 --- a/pod/perldelta.pod +++ b/pod/perldelta.pod @@ -319,6 +319,12 @@ contain the strings 'UTF-8' or 'UTF8' (case-insensitive matching), the default encoding of your STDIN, STDOUT, and STDERR, and of B, is UTF-8. +=item * + +If your filesystem supports returning UTF-8 encoded filenames, +it is possible to make Perl to understand that the filenames +returned by readdir() and glob() are in Unicode. + =back =head2 Safe Signals diff --git a/pod/perldiag.pod b/pod/perldiag.pod index f22aa80..ee3c617 100644 --- a/pod/perldiag.pod +++ b/pod/perldiag.pod @@ -3716,6 +3716,12 @@ what you are doing you can turn off this warning by C. (F) There are no byte-swapping functions for a machine with this byte order. +=item Unknown discipline '%s' + +(F) You specified an unknown I/O discipline. See L for valid +filehandle disciplines and L for valid directory +handle disciplines. + =item Unknown "re" subpragma '%s' (known ones are: %s) You tried to use an unknown subpragma of the "re" pragma. diff --git a/pod/perlfunc.pod b/pod/perlfunc.pod index d20851f..c04cc48 100644 --- a/pod/perlfunc.pod +++ b/pod/perlfunc.pod @@ -3024,12 +3024,19 @@ them, and automatically close whenever and however you leave that scope: See L for some details about mixing reading and writing. +=item opendir DIRHANDLE,MODE,EXPR + =item opendir DIRHANDLE,EXPR Opens a directory named EXPR for processing by C, C, C, C, and C. Returns true if successful. DIRHANDLEs have their own namespace separate from FILEHANDLEs. +In three-argument form the middle argument may be C<:utf8> to force +the filenames returned by readdir() to be in UTF-8 encoding of Unicode. +This naturally works only if your filesystem returns UTF-8 filenames. +The encoding pragma affects this feature, see L. + =item ord EXPR =item ord @@ -3728,6 +3735,10 @@ C there, it would have been testing the wrong file. @dots = grep { /^\./ && -f "$some_dir/$_" } readdir(DIR); closedir DIR; +In some filesystems it is possible to return UTF-8 encoded filenames. +To get readdir() to return such filenames, you must use C<:utf8> with +the three-argument form of opendir(), see L. + =item readline EXPR Reads from the filehandle whose typeglob is contained in EXPR. In scalar diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod index 26e704a..d636700 100644 --- a/pod/perlunicode.pod +++ b/pod/perlunicode.pod @@ -20,6 +20,11 @@ Other encodings can be converted to perl's encoding on input, or from perl's encoding on output by use of the ":encoding(...)" layer. See L. +In some filesystems (for example Microsoft NTFS and Apple HFS+) the +filenames are in UTF-8 . By using opendir() and File::Glob you can +make readdir() and glob() to return the filenames as Unicode, see +L and L for details. + To mark the Perl source itself as being in a particular encoding, see L. diff --git a/pod/perluniintro.pod b/pod/perluniintro.pod index dd3064f..736a0e2 100644 --- a/pod/perluniintro.pod +++ b/pod/perluniintro.pod @@ -407,6 +407,11 @@ If you run this code twice, the contents of the F will be twice UTF-8 encoded. A C would have avoided the bug, or explicitly opening also the F for input as UTF-8. +In some filesystems (for example Microsoft NTFS and Apple HFS+) the +filenames are in UTF-8 . By using opendir() and File::Glob you can +make readdir() and glob() to return the filenames as Unicode, see +L and L for details. + B: the C<:utf8> and C<:encoding> features work only if your Perl has been built with the new "perlio" feature. Almost all Perl 5.8 platforms do use "perlio", though: you can see whether diff --git a/pp_sys.c b/pp_sys.c index 6ed8e0a..ea9a6a5 100644 --- a/pp_sys.c +++ b/pp_sys.c @@ -3747,17 +3747,39 @@ PP(pp_open_dir) dSP; STRLEN n_a; char *dirname = POPpx; - GV *gv = (GV*)POPs; - register IO *io = GvIOn(gv); + char *dscp = NULL; + GV *gv; + register IO *io; + bool want_utf8 = FALSE; + + if (MAXARG == 3) + dscp = POPpx; + + gv = (GV*)POPs; + io = GvIOn(gv); if (!io) goto nope; + if (dscp) { + if (*dscp == ':') { + if (strnEQ(dscp + 1, "utf8", 4)) + want_utf8 = TRUE; + else + Perl_croak(aTHX_ "Unknown discipline '%s'", dscp); + } + else + Perl_croak(aTHX_ "Unknown discipline '%s'", dscp); + } + if (IoDIRP(io)) PerlDir_close(IoDIRP(io)); if (!(IoDIRP(io) = PerlDir_open(dirname))) goto nope; + if (want_utf8) + IoFLAGS(io) |= IOf_DIR_UTF8; + RETPUSHYES; nope: if (!errno) @@ -3795,6 +3817,8 @@ PP(pp_readdir) if (!(IoFLAGS(io) & IOf_UNTAINT)) SvTAINTED_on(sv); #endif + if (IoFLAGS(io) & IOf_DIR_UTF8) + sv_utf8_upgrade(sv); XPUSHs(sv_2mortal(sv)); } } @@ -3810,6 +3834,8 @@ PP(pp_readdir) if (!(IoFLAGS(io) & IOf_UNTAINT)) SvTAINTED_on(sv); #endif + if (IoFLAGS(io) & IOf_DIR_UTF8) + sv_utf8_upgrade(sv); XPUSHs(sv_2mortal(sv)); } RETURN; diff --git a/sv.h b/sv.h index 92dec20..7a13905 100644 --- a/sv.h +++ b/sv.h @@ -407,6 +407,7 @@ struct xpvio { #define IOf_UNTAINT 16 /* consider this fp (and its data) "safe" */ #define IOf_NOLINE 32 /* slurped a pseudo-line from empty file */ #define IOf_FAKE_DIRP 64 /* xio_dirp is fake (source filters kludge) */ +#define IOf_DIR_UTF8 128 /* readdir tries to return utf8 */ /* The following macros define implementation-independent predicates on SVs. */ diff --git a/t/op/readdir.t b/t/op/readdir.t index 7cfecdb..8e67b65 100755 --- a/t/op/readdir.t +++ b/t/op/readdir.t @@ -8,7 +8,7 @@ BEGIN { eval 'opendir(NOSUCH, "no/such/directory");'; if ($@) { print "1..0\n"; exit; } -print "1..3\n"; +print "1..6\n"; for $i (1..2000) { local *OP; @@ -43,3 +43,25 @@ while (@R && @G && "op/".$R[0] eq $G[0]) { shift(@G); } if (@R == 0 && @G == 0) { print "ok 3\n"; } else { print "not ok 3\n"; } + +# Can't really depend on Tru64 UTF-8 filenames being so must just see +# that things don't crash and that *if* UTF-8 were to be received, it's +# valid. (Maybe later add checks that are run if we are on NTFS/HFS+.) +# (see also ext/File/Glob/t/utf8.t) + +opendir(OP, ":utf8", "op"); + +my $a = readdir(OP); + +print utf8::valid($a) ? "ok 4\n" : "not ok 4\n"; + +my @a = readdir(OP); + +print utf8::valid($a[0]) ? "ok 5\n" : "not ok 5\n"; + +# But we can check for bogus mode arguments. + +eval { opendir(OP, ":foo", "op") }; + +print $@ =~ /Unknown discipline ':foo'/ ? "ok 6\n" : "not ok 6\n"; +