X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=lib%2FFile%2FFetch.pm;h=8c8b3f90a573a2fe9bfd6ab1bf2ab7ea901424f6;hb=68c65ec0adda9d2b3fc21f30f68be0ef10de5ad8;hp=12fce7583b315fdecbc8534e79f4494e5567d4db;hpb=79fd8837531c3ec705645385c6a99d6e9c263225;p=p5sagit%2Fp5-mst-13.2.git diff --git a/lib/File/Fetch.pm b/lib/File/Fetch.pm index 12fce75..8c8b3f9 100644 --- a/lib/File/Fetch.pm +++ b/lib/File/Fetch.pm @@ -5,7 +5,6 @@ use FileHandle; use File::Copy; use File::Spec; use File::Spec::Unix; -use File::Fetch::Item; use File::Basename qw[dirname]; use Cwd qw[cwd]; @@ -21,8 +20,12 @@ use vars qw[ $VERBOSE $PREFER_BIN $FROM_EMAIL $USER_AGENT $FTP_PASSIVE $TIMEOUT $DEBUG $WARN ]; -$VERSION = 0.08; -$PREFER_BIN = 0; # XXX TODO implement +use constant QUOTE => do { $^O eq 'MSWin32' ? q["] : q['] }; + + +$VERSION = '0.14'; +$VERSION = eval $VERSION; # avoid warnings with development releases +$PREFER_BIN = 0; # XXX TODO implement $FROM_EMAIL = 'File-Fetch@example.com'; $USER_AGENT = 'File::Fetch/$VERSION'; $BLACKLIST = [qw|ftp|]; @@ -47,10 +50,11 @@ local $Module::Load::Conditional::VERBOSE = 0; local $Module::Load::Conditional::VERBOSE = 0; ### see what OS we are on, important for file:// uris ### -use constant ON_UNIX => ($^O ne 'MSWin32' and - $^O ne 'MacOS' and - $^O ne 'VMS'); - +use constant ON_WIN => ($^O eq 'MSWin32'); +use constant ON_VMS => ($^O eq 'VMS'); +use constant ON_UNIX => (!ON_WIN); +use constant HAS_VOL => (ON_WIN); +use constant HAS_SHARE => (ON_WIN); =pod =head1 NAME @@ -86,6 +90,167 @@ C, or C uri by a number of different means. See the C section further down for details. +=head1 ACCESSORS + +A C object has the following accessors + +=over 4 + +=item $ff->uri + +The uri you passed to the constructor + +=item $ff->scheme + +The scheme from the uri (like 'file', 'http', etc) + +=item $ff->host + +The hostname in the uri. Will be empty if host was originally +'localhost' for a 'file://' url. + +=item $ff->vol + +On operating systems with the concept of a volume the second element +of a file:// is considered to the be volume specification for the file. +Thus on Win32 this routine returns the volume, on other operating +systems this returns nothing. + +On Windows this value may be empty if the uri is to a network share, in +which case the 'share' property will be defined. Additionally, volume +specifications that use '|' as ':' will be converted on read to use ':'. + +On VMS, which has a volume concept, this field will be empty because VMS +file specifications are converted to absolute UNIX format and the volume +information is transparently included. + +=item $ff->share + +On systems with the concept of a network share (currently only Windows) returns +the sharename from a file://// url. On other operating systems returns empty. + +=item $ff->path + +The path from the uri, will be at least a single '/'. + +=item $ff->file + +The name of the remote file. For the local file name, the +result of $ff->output_file will be used. + +=cut + + +########################## +### Object & Accessors ### +########################## + +{ + ### template for new() and autogenerated accessors ### + my $Tmpl = { + scheme => { default => 'http' }, + host => { default => 'localhost' }, + path => { default => '/' }, + file => { required => 1 }, + uri => { required => 1 }, + vol => { default => '' }, # windows for file:// uris + share => { default => '' }, # windows for file:// uris + _error_msg => { no_override => 1 }, + _error_msg_long => { no_override => 1 }, + }; + + for my $method ( keys %$Tmpl ) { + no strict 'refs'; + *$method = sub { + my $self = shift; + $self->{$method} = $_[0] if @_; + return $self->{$method}; + } + } + + sub _create { + my $class = shift; + my %hash = @_; + + my $args = check( $Tmpl, \%hash ) or return; + + bless $args, $class; + + if( lc($args->scheme) ne 'file' and not $args->host ) { + return File::Fetch->_error(loc( + "Hostname required when fetching from '%1'",$args->scheme)); + } + + for (qw[path file]) { + unless( $args->$_() ) { # 5.5.x needs the () + return File::Fetch->_error(loc("No '%1' specified",$_)); + } + } + + return $args; + } +} + +=item $ff->output_file + +The name of the output file. This is the same as $ff->file, +but any query parameters are stripped off. For example: + + http://example.com/index.html?x=y + +would make the output file be C rather than +C. + +=back + +=cut + +sub output_file { + my $self = shift; + my $file = $self->file; + + $file =~ s/\?.*$//g; + + return $file; +} + +### XXX do this or just point to URI::Escape? +# =head2 $esc_uri = $ff->escaped_uri +# +# =cut +# +# ### most of this is stolen straight from URI::escape +# { ### Build a char->hex map +# my %escapes = map { chr($_) => sprintf("%%%02X", $_) } 0..255; +# +# sub escaped_uri { +# my $self = shift; +# my $uri = $self->uri; +# +# ### Default unsafe characters. RFC 2732 ^(uric - reserved) +# $uri =~ s/([^A-Za-z0-9\-_.!~*'()])/ +# $escapes{$1} || $self->_fail_hi($1)/ge; +# +# return $uri; +# } +# +# sub _fail_hi { +# my $self = shift; +# my $char = shift; +# +# $self->_error(loc( +# "Can't escape '%1', try using the '%2' module instead", +# sprintf("\\x{%04X}", ord($char)), 'URI::Escape' +# )); +# } +# +# sub output_file { +# +# } +# +# +# } + =head1 METHODS =head2 $ff = File::Fetch->new( uri => 'http://some.where.com/dir/file.txt' ); @@ -112,11 +277,11 @@ sub new { my $href = __PACKAGE__->_parse_uri( $uri ) or return; ### make it into a FFI object ### - my $ffi = File::Fetch::Item->new( %$href ) or return; + my $ff = File::Fetch->_create( %$href ) or return; ### return the object ### - return $ffi; + return $ff; } ### parses an uri to a hash structure: @@ -132,6 +297,34 @@ sub new { ### file => 'index.html' ### }; ### +### In the case of file:// urls there maybe be additional fields +### +### For systems with volume specifications such as Win32 there will be +### a volume specifier provided in the 'vol' field. +### +### 'vol' => 'volumename' +### +### For windows file shares there may be a 'share' key specified +### +### 'share' => 'sharename' +### +### Note that the rules of what a file:// url means vary by the operating system +### of the host being addressed. Thus file:///d|/foo/bar.txt means the obvious +### 'D:\foo\bar.txt' on windows, but on unix it means '/d|/foo/bar.txt' and +### not '/foo/bar.txt' +### +### Similarly if the host interpreting the url is VMS then +### file:///disk$user/my/notes/note12345.txt' means +### 'DISK$USER:[MY.NOTES]NOTE123456.TXT' but will be returned the same as +### if it is unix where it means /disk$user/my/notes/note12345.txt'. +### Except for some cases in the File::Spec methods, Perl on VMS will generally +### handle UNIX format file specifications. +### +### This means it is impossible to serve certain file:// urls on certain systems. +### +### Thus are the problems with a protocol-less specification. :-( +### + sub _parse_uri { my $self = shift; my $uri = shift or return; @@ -142,13 +335,52 @@ sub _parse_uri { $uri =~ s|^(\w+)://||; $href->{scheme} = $1; - ### file:// paths have no host ### + ### See rfc 1738 section 3.10 + ### http://www.faqs.org/rfcs/rfc1738.html + ### And wikipedia for more on windows file:// urls + ### http://en.wikipedia.org/wiki/File:// if( $href->{scheme} eq 'file' ) { - $href->{path} = $uri; - $href->{host} = ''; + + my @parts = split '/',$uri; + + ### file://hostname/... + ### file://hostname/... + ### normalize file://localhost with file:/// + $href->{host} = $parts[0] || ''; + + ### index in @parts where the path components begin; + my $index = 1; + + ### file:////hostname/sharename/blah.txt + if ( HAS_SHARE and not length $parts[0] and not length $parts[1] ) { + + $href->{host} = $parts[2] || ''; # avoid warnings + $href->{share} = $parts[3] || ''; # avoid warnings + + $index = 4 # index after the share + + ### file:///D|/blah.txt + ### file:///D:/blah.txt + } elsif (HAS_VOL) { + + ### this code comes from dmq's patch, but: + ### XXX if volume is empty, wouldn't that be an error? --kane + ### if so, our file://localhost test needs to be fixed as wel + $href->{vol} = $parts[1] || ''; + + ### correct D| style colume descriptors + $href->{vol} =~ s/\A([A-Z])\|\z/$1:/i if ON_WIN; + + $index = 2; # index after the volume + } + + ### rebuild the path from the leftover parts; + $href->{path} = join '/', '', splice( @parts, $index, $#parts ); } else { - @{$href}{qw|host path|} = $uri =~ m|([^/]*)(/.*)$|s; + ### using anything but qw() in hash slices may produce warnings + ### in older perls :-( + @{$href}{ qw(host path) } = $uri =~ m|([^/]*)(/.*)$|s; } ### split the path into file + dir ### @@ -157,6 +389,10 @@ sub _parse_uri { $href->{file} = $parts[2]; } + ### host will be empty if the target was 'localhost' and the + ### scheme was 'file' + $href->{host} = '' if ($href->{host} eq 'localhost') and + ($href->{scheme} eq 'file'); return $href; } @@ -182,6 +418,9 @@ sub fetch { check( $tmpl, \%hash ) or return; + ### On VMS force to VMS format so File::Spec will work. + $to = VMS::Filespec::vmspath($to) if ON_VMS; + ### create the path if it doesn't exist yet ### unless( -d $to ) { eval { mkpath( $to ) }; @@ -192,7 +431,11 @@ sub fetch { ### set passive ftp if required ### local $ENV{FTP_PASSIVE} = $FTP_PASSIVE; - ### + ### we dont use catfile on win32 because if we are using a cygwin tool + ### under cmd.exe they wont understand windows style separators. + my $out_to = ON_WIN ? $to.'/'.$self->output_file + : File::Spec->catfile( $to, $self->output_file ); + for my $method ( @{ $METHODS->{$self->scheme} } ) { my $sub = '_'.$method.'_fetch'; @@ -208,7 +451,17 @@ sub fetch { ### method is known to fail ### next if $METHOD_FAIL->{$method}; - if(my $file = $self->$sub(to=>File::Spec->catfile($to,$self->file))){ + ### there's serious issues with IPC::Run and quoting of command + ### line arguments. using quotes in the wrong place breaks things, + ### and in the case of say, + ### C:\cygwin\bin\wget.EXE --quiet --passive-ftp --output-document + ### "index.html" "http://www.cpan.org/index.html?q=1&y=2" + ### it doesn't matter how you quote, it always fails. + local $IPC::Cmd::USE_IPC_RUN = 0; + + if( my $file = $self->$sub( + to => $out_to + )){ unless( -e $file && -s _ ) { $self->_error(loc("'%1' said it fetched '%2', ". @@ -233,37 +486,6 @@ sub fetch { return; } -=head1 ACCESSORS - -A C object has the following accessors - -=over 4 - -=item $ff->uri - -The uri you passed to the constructor - -=item $ff->scheme - -The scheme from the uri (like 'file', 'http', etc) - -=item $ff->host - -The hostname in the uri, will be empty for a 'file' scheme. - -=item $ff->path - -The path from the uri, will be at least a single '/'. - -=item $ff->file - -The name of the remote file. Will be used as the name for the local -file as well. - -=back - -=cut - ######################## ### _*_fetch methods ### ######################## @@ -404,11 +626,18 @@ sub _wget_fetch { push @$cmd, '--passive-ftp' if $FTP_PASSIVE; ### set the output document, add the uri ### - push @$cmd, '--output-document', $to, $self->uri; + push @$cmd, '--output-document', + ### DO NOT quote things for IPC::Run, it breaks stuff. + $IPC::Cmd::USE_IPC_RUN + ? ($to, $self->uri) + : (QUOTE. $to .QUOTE, QUOTE. $self->uri .QUOTE); ### shell out ### my $captured; - unless( run( command => $cmd, buffer => \$captured, verbose => 0 ) ) { + unless(run( command => $cmd, + buffer => \$captured, + verbose => $DEBUG + )) { ### wget creates the output document always, even if the fetch ### fails.. so unlink it in that case 1 while unlink $to; @@ -436,7 +665,7 @@ sub _ftp_fetch { }; check( $tmpl, \%hash ) or return; - ### see if we have a wget binary ### + ### see if we have a ftp binary ### if( my $ftp = can_run('ftp') ) { my $fh = FileHandle->new; @@ -454,7 +683,7 @@ sub _ftp_fetch { "cd /", "cd " . $self->path, "binary", - "get " . $self->file . " " . $self->file, + "get " . $self->file . " " . $self->output_file, "quit", ); @@ -477,9 +706,16 @@ sub _lynx_fetch { }; check( $tmpl, \%hash ) or return; - ### see if we have a wget binary ### + ### see if we have a lynx binary ### if( my $lynx = can_run('lynx') ) { + unless( IPC::Cmd->can_capture_buffer ) { + $METHOD_FAIL->{'lynx'} = 1; + + return $self->_error(loc( + "Can not capture buffers. Can not use '%1' to fetch files", + 'lynx' )); + } ### write to the output file ourselves, since lynx ass_u_mes to much my $local = FileHandle->new(">$to") @@ -495,7 +731,11 @@ sub _lynx_fetch { push @$cmd, "-connect_timeout=$TIMEOUT" if $TIMEOUT; - push @$cmd, $self->uri; + ### DO NOT quote things for IPC::Run, it breaks stuff. + push @$cmd, $IPC::Cmd::USE_IPC_RUN + ? $self->uri + : QUOTE. $self->uri .QUOTE; + ### shell out ### my $captured; @@ -540,7 +780,7 @@ sub _ncftp_fetch { ### if $FTP_PASSIVE is set return if $FTP_PASSIVE; - ### see if we have a wget binary ### + ### see if we have a ncftp binary ### if( my $ncftp = can_run('ncftp') ) { my $cmd = [ @@ -550,7 +790,12 @@ sub _ncftp_fetch { $self->host, # hostname dirname($to), # local dir for the file # remote path to the file - File::Spec::Unix->catdir( $self->path, $self->file ), + ### DO NOT quote things for IPC::Run, it breaks stuff. + $IPC::Cmd::USE_IPC_RUN + ? File::Spec::Unix->catdir( $self->path, $self->file ) + : QUOTE. File::Spec::Unix->catdir( + $self->path, $self->file ) .QUOTE + ]; ### shell out ### @@ -597,7 +842,11 @@ sub _curl_fetch { ### curl doesn't follow 302 (temporarily moved) etc automatically ### so we add --location to enable that. - push @$cmd, '--fail', '--location', '--output', $to, $self->uri; + push @$cmd, '--fail', '--location', '--output', + ### DO NOT quote things for IPC::Run, it breaks stuff. + $IPC::Cmd::USE_IPC_RUN + ? ($to, $self->uri) + : (QUOTE. $to .QUOTE, QUOTE. $self->uri .QUOTE); my $captured; unless(run( command => $cmd, @@ -618,8 +867,11 @@ sub _curl_fetch { ### use File::Copy for fetching file:// urls ### -### XXX file:// uri to local path conversion is just too weird... -### depend on LWP to do it for us +### +### See section 3.10 of RFC 1738 (http://www.faqs.org/rfcs/rfc1738.html) +### Also see wikipedia on file:// (http://en.wikipedia.org/wiki/File://) +### + sub _file_fetch { my $self = shift; my %hash = @_; @@ -630,14 +882,50 @@ sub _file_fetch { }; check( $tmpl, \%hash ) or return; + + ### prefix a / on unix systems with a file uri, since it would ### look somewhat like this: - ### file://home/kane/file - ### wheras windows file uris might look like: - ### file://C:/home/kane/file - my $path = ON_UNIX ? '/'. $self->path : $self->path; + ### file:///home/kane/file + ### wheras windows file uris for 'c:\some\dir\file' might look like: + ### file:///C:/some/dir/file + ### file:///C|/some/dir/file + ### or for a network share '\\host\share\some\dir\file': + ### file:////host/share/some/dir/file + ### + ### VMS file uri's for 'DISK$USER:[MY.NOTES]NOTE123456.TXT' might look like: + ### file://vms.host.edu/disk$user/my/notes/note12345.txt + ### + + my $path = $self->path; + my $vol = $self->vol; + my $share = $self->share; + + my $remote; + if (!$share and $self->host) { + return $self->_error(loc( + "Currently %1 cannot handle hosts in %2 urls", + 'File::Fetch', 'file://' + )); + } + + if( $vol ) { + $path = File::Spec->catdir( split /\//, $path ); + $remote = File::Spec->catpath( $vol, $path, $self->file); - my $remote = File::Spec->catfile( $path, $self->file ); + } elsif( $share ) { + ### win32 specific, and a share name, so we wont bother with File::Spec + $path =~ s|/+|\\|g; + $remote = "\\\\".$self->host."\\$share\\$path"; + + } else { + ### File::Spec on VMS can not currently handle UNIX syntax. + my $file_class = ON_VMS + ? 'File::Spec::Unix' + : 'File::Spec'; + + $remote = $file_class->catfile( $path, $self->file ); + } ### File::Copy is littered with 'die' statements :( ### my $rv = eval { File::Copy::copy( $remote, $to ) }; @@ -671,7 +959,10 @@ sub _rsync_fetch { push(@$cmd, '--quiet') unless $DEBUG; - push @$cmd, $self->uri, $to; + ### DO NOT quote things for IPC::Run, it breaks stuff. + push @$cmd, $IPC::Cmd::USE_IPC_RUN + ? ($self->uri, $to) + : (QUOTE. $self->uri .QUOTE, QUOTE. $to .QUOTE); my $captured; unless(run( command => $cmd, @@ -679,7 +970,8 @@ sub _rsync_fetch { verbose => $DEBUG ) ) { - return $self->_error(loc("Command failed: %1", $captured || '')); + return $self->_error(loc("Command %1 failed: %2", + "@$cmd" || '', $captured || '')); } return $to; @@ -705,31 +997,25 @@ Pass it a true value to get the C output instead. =cut -### Error handling, the way Archive::Tar does it ### -{ - my $error = ''; - my $longmess = ''; - - sub _error { - my $self = shift; - $error = shift; - $longmess = Carp::longmess($error); - - ### set Archive::Tar::WARN to 0 to disable printing - ### of errors - if( $WARN ) { - carp $DEBUG ? $longmess : $error; - } - - return; +### error handling the way Archive::Extract does it +sub _error { + my $self = shift; + my $error = shift; + + $self->_error_msg( $error ); + $self->_error_msg_long( Carp::longmess($error) ); + + if( $WARN ) { + carp $DEBUG ? $self->_error_msg_long : $self->_error_msg; } - sub error { - my $self = shift; - return shift() ? $longmess : $error; - } + return; } +sub error { + my $self = shift; + return shift() ? $self->_error_msg_long : $self->_error_msg; +} 1; @@ -888,6 +1174,20 @@ between a 'successfull' fetch and a custom error page. Therefor, we recommend to only use C as a last resort. This is why it is at the back of our list of methods to try as well. +=head2 Files I'm trying to fetch have reserved characters or non-ASCII characters in them. What do I do? + +C is relatively smart about things. When trying to write +a file to disk, it removes the C (see the +C method for details) from the file name before creating +it. In most cases this suffices. + +If you have any other characters you need to escape, please install +the C module from CPAN, and pre-encode your URI before +passing it to C. You can read about the details of URIs +and URI encoding here: + + http://www.faqs.org/rfcs/rfc2396.html + =head1 TODO =over 4 @@ -896,20 +1196,21 @@ why it is at the back of our list of methods to try as well. To indicate to rather use commandline tools than modules -=head1 AUTHORS +=back + +=head1 BUG REPORTS + +Please report bugs or other issues to Ebug-file-fetch@rt.cpan.org. + +=head1 AUTHOR -This module by -Jos Boumans Ekane@cpan.orgE. +This module by Jos Boumans Ekane@cpan.orgE. =head1 COPYRIGHT -This module is -copyright (c) 2003 Jos Boumans Ekane@cpan.orgE. -All rights reserved. +This library is free software; you may redistribute and/or modify it +under the same terms as Perl itself. -This library is free software; -you may redistribute and/or modify it under the same -terms as Perl itself. =cut