From: John Napiorkowski Date: Fri, 26 Dec 2014 20:42:10 +0000 (-0600) Subject: changes so that we skip encoding under programmatic situations X-Git-Tag: 5.90079_008~24 X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?p=catagits%2FCatalyst-Runtime.git;a=commitdiff_plain;h=6adc45cf93c50a080f1f32bad475fd2ab5844887 changes so that we skip encoding under programmatic situations --- diff --git a/Changes b/Changes index 7b5b2bf..64897ae 100644 --- a/Changes +++ b/Changes @@ -5,6 +5,16 @@ - Exposed the reqexp we use to match content types that need encoding via a global variable. - Added some test cases for JSON utf8 and tested file uploads with utf8. + - Fixes to decoding on file upload filenames and related methods + - new methods on upload object that tries to do the right thing if we find + a character set on the upload and its UTF8. + - new additional helper methods on the file upload object. + - new helper methods has_encoding and clear_encoding on context. + - Method on Catalyst::Response to determine if the reponse should be encoded. + - Warn if changing headers only if headers are finalized AND the response callback + has allready been called (and headers already sent). + - Centralized rules about detecting if we need to automatically encode or not and + added tests around cases when you choose to skip auto encoding. 5.90079_003 - 2014-12-03 - Make sure all tests run even if debug mode is enabled. diff --git a/lib/Catalyst.pm b/lib/Catalyst.pm index 9592b81..95ddb97 100644 --- a/lib/Catalyst.pm +++ b/lib/Catalyst.pm @@ -113,7 +113,6 @@ our $START = time; our $RECURSION = 1000; our $DETACH = Catalyst::Exception::Detach->new; our $GO = Catalyst::Exception::Go->new; -our $DEFAULT_ENCODE_CONTENT_TYPE_MATCH = qr{text|xml$|javascript$}; #I imagine that very few of these really need to be class variables. if any. #maybe we should just make them attributes with a default? @@ -1014,12 +1013,31 @@ And later: Your log class should implement the methods described in L. +=head2 has_encoding + +Returned True if there's a valid encoding + +=head2 clear_encoding + +Clears the encoding for the current context + =head2 encoding Sets or gets the application encoding. =cut +sub has_encoding { shift->encoding ? 1:0 } + +sub clear_encoding { + my $c = shift; + if(blessed $c) { + $c->encoding(undef); + } else { + $c->debug->error("You can't clear encoding on the application"); + } +} + sub encoding { my $c = shift; my $encoding; @@ -2050,25 +2068,10 @@ sub finalize_headers { $c->finalize_cookies; + # This currently is a NOOP but I don't want to remove it since I guess people + # might have Response subclasses that use it for something... (JNAP) $c->response->finalize_headers(); - if(my $enc = $c->encoding) { - my ($ct, $ct_enc) = $c->response->content_type; - - # Only touch 'text-like' contents - if($c->response->content_type =~ /$DEFAULT_ENCODE_CONTENT_TYPE_MATCH/) { - if ($ct_enc && $ct_enc =~ /charset=([^;]*)/) { - if (uc($1) ne uc($enc->mime_name)) { - $c->log->debug("Catalyst encoding config is set to encode in '" . - $enc->mime_name . - "', content type is '$1', not encoding "); - } - } else { - $c->res->content_type($c->res->content_type . "; charset=" . $enc->mime_name); - } - } - } - # Done $response->finalized_headers(1); } @@ -2079,26 +2082,44 @@ Make sure your body is encoded properly IF you set an encoding. By default the encoding is UTF-8 but you can disable it by explictly setting the encoding configuration value to undef. +We can only encode when the body is a scalar. Methods for encoding via the +streaming interfaces (such as C and C on L +are available). + See L. =cut sub finalize_encoding { my $c = shift; + my $res = $c->res || return; + + # Warn if the set charset is different from the one you put into encoding. We need + # to do this early since encodable_response is false for this condition and we need + # to match the debug output for backcompat (there's a test for this...) -JNAP + if( + $res->content_type_charset and $c->encoding and + (uc($c->encoding->mime_name) ne uc($res->content_type_charset)) + ) { + my $ct = lc($res->content_type_charset); + $c->log->debug("Catalyst encoding config is set to encode in '" . + $c->encoding->mime_name . + "', content type is '$ct', not encoding "); + } - my $body = $c->response->body; - - return unless defined($body); - - my $enc = $c->encoding; - - return unless $enc; - - # Only touch 'text-like' contents - if($c->response->content_type =~ /$DEFAULT_ENCODE_CONTENT_TYPE_MATCH/) { - if (ref(\$body) eq 'SCALAR') { - $c->response->body( $c->encoding->encode( $body, $c->_encode_check ) ); - } + if( + ($res->encodable_response) and + (defined($res->body)) and + (ref(\$res->body) eq 'SCALAR') + ) { + $c->res->body( $c->encoding->encode( $c->res->body, $c->_encode_check ) ); + + # Set the charset if necessary. This might be a bit bonkers since encodable response + # is false when the set charset is not the same as the encoding mimetype (maybe + # confusing action at a distance here.. + # Don't try to set the charset if one already exists + $c->res->content_type($c->res->content_type . "; charset=" . $c->encoding->mime_name) + unless($c->res->content_type_charset); } } @@ -4007,20 +4028,27 @@ matches the following regular expression: $content_type =~ /^text|xml$|javascript$/ -The value of this regex is contained in the global variable - - $Catalyst::DEFAULT_ENCODE_CONTENT_TYPE_MATCH +Encoding is set on the application, but it is copied to the response object +so you can override encoding rules per request (See L +for more information). -This may change in the future. Be default we don't automatically -encode 'application/json' since the most popular JSON encoders (such -as L which is the library that L can make use -of) will do the UTF8 encoding and decoding automatically. Having it on -in Catalyst could result in double encoding. +Be default we don't automatically encode 'application/json' since the most +popular JSON encoders (such as L which is the library that +L can make use of) will do the UTF8 encoding and decoding automatically. +Having it on in Catalyst could result in double encoding. If you are producing JSON response in an unconventional manner (such as via a template or manual strings) you should perform the UTF8 encoding manually as well such as to conform to the JSON specification. +NOTE: We also examine the value of $c->response->content_encoding. If +you set this (like for example 'gzip', and manually gzipping the body) +we assume that you have done all the neccessary encoding yourself, since +we cannot encode the gzipped contents. If you use a plugin like +L we will be updating that plugin to work +with the new UTF8 encoding code, or you can use L +or (probably best) do your compression on a front end proxy. + =head2 Methods =over 4 diff --git a/lib/Catalyst/Engine.pm b/lib/Catalyst/Engine.pm index 194d254..948f28f 100644 --- a/lib/Catalyst/Engine.pm +++ b/lib/Catalyst/Engine.pm @@ -685,6 +685,7 @@ sub prepare_uploads { ( size => $upload->{size}, type => scalar $headers->content_type, + charset => scalar $headers->content_type_charset, headers => $headers, tempname => $upload->{tempname}, filename => $filename, diff --git a/lib/Catalyst/Request/Upload.pm b/lib/Catalyst/Request/Upload.pm index d8e58be..6df2dff 100644 --- a/lib/Catalyst/Request/Upload.pm +++ b/lib/Catalyst/Request/Upload.pm @@ -15,6 +15,8 @@ has size => (is => 'rw'); has tempname => (is => 'rw'); has type => (is => 'rw'); has basename => (is => 'ro', lazy_build => 1); +has raw_basename => (is => 'ro', lazy_build => 1); +has charset => (is=>'ro', predicate=>'has_charset'); has fh => ( is => 'rw', @@ -29,17 +31,21 @@ has fh => ( Catalyst::Exception->throw( message => qq/Can't open '$filename': '$!'/ ); } - return $fh; }, ); sub _build_basename { + my $basename = shift->raw_basename; + $basename =~ s|[^\w\.-]+|_|g; + return $basename; +} + +sub _build_raw_basename { my $self = shift; my $basename = $self->filename; $basename =~ s|\\|/|g; $basename = ( File::Spec::Unix->splitpath($basename) )[2]; - $basename =~ s|[^\w\.-]+|_|g; return $basename; } @@ -58,13 +64,16 @@ Catalyst::Request::Upload - handles file upload requests $upload->basename; $upload->copy_to; $upload->fh; + $upload->decoded_fh $upload->filename; $upload->headers; $upload->link_to; $upload->size; $upload->slurp; + $upload->decoded_slurp; $upload->tempname; $upload->type; + $upload->charset; To specify where Catalyst should put the temporary files, set the 'uploadtmp' option in the Catalyst config. If unset, Catalyst will use the system temp dir. @@ -97,10 +106,56 @@ sub copy_to { return File::Copy::copy( $self->tempname, @_ ); } +=head2 $upload->is_utf8_encoded + +Returns true of the upload defines a character set at that value is 'UTF-8'. +This does not try to inspect your upload and make any guesses if the Content +Type charset is undefined. + +=cut + +sub is_utf8_encoded { + my $self = shift; + if(my $charset = $self->charset) { + return $charset eq 'UTF-8' ? 1 : 0; + } + return 0; +} + =head2 $upload->fh Opens a temporary file (see tempname below) and returns an L handle. +This is a filehandle that is opened with no additional IO Layers. + +=head2 $upload->decoded_fh(?$encoding) + +Returns a filehandle that has binmode set to UTF-8 if a UTF-8 character set +is found. This also accepts an override encoding value that you can use to +force a particular L layer. If neither are found the filehandle is +set to :raw. + +This is useful if you are pulling the file into code and inspecting bit and +maybe then sending those bits back as the response. (Please not this is not +a suitable filehandle to set in the body; use C if you are doing that). + +Please note that using this method sets the underlying filehandle IO layer +so once you use this method if you go back and use the C method you +still get the IO layer applied. + +=cut + +sub decoded_fh { + my ($self, $layer) = @_; + my $fh = $self->fh; + + $layer = ":encoding(UTF-8)" if !$layer && $self->is_utf8_encoded; + $layer = ':raw' unless $layer; + + binmode($fh, $layer); + return $fh; +} + =head2 $upload->filename Returns the client-supplied filename. @@ -127,13 +182,17 @@ sub link_to { Returns the size of the uploaded file in bytes. -=head2 $upload->slurp +=head2 $upload->slurp(?$encoding) + +Optionally accepts an argument to define an IO Layer (which is applied to +the filehandle via binmode; if no layer is defined the default is set to +":raw". Returns a scalar containing the contents of the temporary file. Note that this will cause the filehandle pointed to by C<< $upload->fh >> to be reset to the start of the file using seek and the file handle to be put -into binary mode. +into whatever encoding mode is applied. =cut @@ -158,9 +217,39 @@ sub slurp { return $content; } +=head2 $upload->decoded_slurp(?$encoding) + +Works just like C except we use C instead of C to +open a filehandle to slurp. This means if your upload charset is UTF8 +we binmode the filehandle to that encoding. + +=cut + +sub decoded_slurp { + my ( $self, $layer ) = @_; + my $handle = $self->decoded_fh($layer); + + my $content = undef; + $handle->seek(0, IO::File::SEEK_SET); + while ( $handle->sysread( my $buffer, 8192 ) ) { + $content .= $buffer; + } + + $handle->seek(0, IO::File::SEEK_SET); + return $content; +} + =head2 $upload->basename -Returns basename for C. +Returns basename for C. This filters the name through a regexp +C to make it safe for filesystems that don't +like advanced characters. This will of course filter UTF8 characters. +If you need the exact basename unfiltered use C. + +=head2 $upload->raw_basename + +Just like C but without filtering the filename for characters that +don't always write to a filesystem. =head2 $upload->tempname @@ -170,6 +259,11 @@ Returns the path to the temporary file. Returns the client-supplied Content-Type. +=head2 $upload->charset + +The character set information part of the content type, if any. Useful if you +need to figure out any encodings on the file upload. + =head2 meta Provided by Moose diff --git a/lib/Catalyst/Response.pm b/lib/Catalyst/Response.pm index 0ef4c52..28cf7b7 100644 --- a/lib/Catalyst/Response.pm +++ b/lib/Catalyst/Response.pm @@ -6,9 +6,18 @@ use Moose::Util::TypeConstraints; use namespace::autoclean; use Scalar::Util 'blessed'; use Catalyst::Response::Writer; +use Catalyst::Utils (); with 'MooseX::Emulate::Class::Accessor::Fast'; +our $DEFAULT_ENCODE_CONTENT_TYPE_MATCH = qr{text|xml$|javascript$}; + +has encodable_content_type => ( + is => 'rw', + required => 1, + default => sub { $DEFAULT_ENCODE_CONTENT_TYPE_MATCH } +); + has _response_cb => ( is => 'ro', isa => 'CodeRef', @@ -55,10 +64,10 @@ has write_fh => ( sub _build_write_fh { my $writer = $_[0]->_writer; # We need to get the finalize headers side effect... - my $requires_encoding = $_[0]->content_type =~ m/$Catalyst::DEFAULT_ENCODE_CONTENT_TYPE_MATCH/; + my $requires_encoding = $_[0]->encodable_response; my %fields = ( _writer => $writer, - _encoding => $_[0]->encoding, + _encoding => $_[0]->_context->encoding, _requires_encoding => $requires_encoding, ); @@ -83,7 +92,7 @@ has finalized_headers => (is => 'rw', default => 0); has headers => ( is => 'rw', isa => 'HTTP::Headers', - handles => [qw(content_encoding content_length content_type header)], + handles => [qw(content_encoding content_length content_type content_type_charset header)], default => sub { HTTP::Headers->new() }, required => 1, lazy => 1, @@ -94,15 +103,13 @@ has _context => ( clearer => '_clear_context', ); -has encoding => (is=>'ro'); - before [qw(status headers content_encoding content_length content_type header)] => sub { my $self = shift; $self->_context->log->warn( - "Useless setting a header value after finalize_headers called." . + "Useless setting a header value after finalize_headers and the response callback has been called." . " Not what you want." ) - if ( $self->finalized_headers && @_ ); + if ( $self->finalized_headers && !$self->_has_response_cb && @_ ); }; sub output { shift->body(@_) } @@ -117,8 +124,9 @@ sub write { $buffer = q[] unless defined $buffer; - $buffer = $self->_context->encoding->encode( $buffer, $self->_context->_encode_check ) - if $self->_context->encoding && $self->content_type =~ /$Catalyst::DEFAULT_ENCODE_CONTENT_TYPE_MATCH/; + if($self->encodable_response) { + $buffer = $self->_context->encoding->encode( $buffer, $self->_context->_encode_check ) + } my $len = length($buffer); $self->_writer->write($buffer); @@ -195,6 +203,22 @@ you might want to use a L type of object (Something that implements in the same fashion), or a filehandle GLOB. Catalyst will write it piece by piece into the response. +If you are using a filehandle as the body response you are responsible for +making sure it comforms to the L specification with regards to content +encoding. Unlike with scalar body values or when using the streaming interfaces +we currently do not attempt to normalize and encode your filehandle. In general +this means you should be sure to be sending bytes not UTF8 decoded multibyte +characters. + +Most of the time when you do: + + open(my $fh, '<:raw', $path); + +You should be fine. If you open a filehandle with a L layer you probably +are not fine. You can usually fix this by explicitly using binmode to set +the IOLayer to :raw. Its possible future versions of L will try to +'do the right thing'. + When using a L type of object and no content length has been already set in the response headers Catalyst will make a reasonable attempt to determine the size of the Handle. Depending on the implementation of your @@ -306,6 +330,10 @@ This value is typically set by your view or plugin. For example, L will guess the mime type based on the file it found, while L defaults to C. +=head2 $res->content_type_charset + +Shortcut for $res->headers->content_type_charset; + =head2 $res->cookies Returns a reference to a hash containing cookies to be set. The keys of the @@ -511,6 +539,67 @@ Example: Please note this does not attempt to map or nest your PSGI application under the Controller and Action namespace or path. +=head2 encodable_content_type + +This is a regular expression used to determine of the current content type +should be considered encodable. Currently we apply default encoding (usually +UTF8) to text type contents. Here's the default regular expression: + +This would match content types like: + + text/plain + text/html + text/xml + application/javascript + application/xml + application/vnd.user+xml + +B: We don't encode JSON content type responses by default since most +of the JSON serializers that are commonly used for this task will do so +automatically and we don't want to double encode. If you are not using a +tool like L to produce JSON type content, (for example you are using +a template system, or creating the strings manually) you will need to either +encoding the body yourself: + + $c->response->body( $c->encoding->encode( $body, $c->_encode_check ) ); + +Or you can alter the regular expression using this attribute. + +=head2 encodable_response + +Given a L return true if its one that can be encoded. + + make sure there is an encoding set on the response + make sure the content type is encodable + make sure no content type charset has been already set to something different from the global encoding + make sure no content encoding is present. + +Note this does not inspect a body since we do allow automatic encoding on streaming +type responses. + +=cut + +sub encodable_response { + my ($self) = @_; + return 0 unless $self->_context; # Cases like returning a HTTP Exception response you don't have a context here... + return 0 unless $self->_context->encoding; + + my $has_manual_charset = 0; + if(my $charset = $self->content_type_charset) { + $has_manual_charset = (uc($charset) ne uc($self->_context->encoding->mime_name)) ? 1:0; + } + + if( + ($self->content_type =~ m/${\$self->encodable_content_type}/) and + (!$has_manual_charset) and + (!$self->content_encoding || $self->content_encoding eq 'identity' ) + ) { + return 1; + } else { + return 0; + } +} + =head2 DEMOLISH Ensures that the response is flushed and closed at the end of the diff --git a/lib/Catalyst/Utils.pm b/lib/Catalyst/Utils.pm index 5ee7451..1bccecb 100644 --- a/lib/Catalyst/Utils.pm +++ b/lib/Catalyst/Utils.pm @@ -10,7 +10,6 @@ use Cwd; use Class::Load 'is_class_loaded'; use String::RewritePrefix; use Class::Load (); - use namespace::clean; =head1 NAME @@ -503,6 +502,8 @@ sub apply_registered_middleware { return $new_psgi; } + + =head1 PSGI Helpers Utility functions to make it easier to work with PSGI applications under Catalyst diff --git a/t/lib/TestAppUnicode.pm b/t/lib/TestAppUnicode.pm index 55359f7..8338f3d 100644 --- a/t/lib/TestAppUnicode.pm +++ b/t/lib/TestAppUnicode.pm @@ -3,7 +3,7 @@ use strict; use warnings; use TestLogger; use base qw/Catalyst/; -use Catalyst qw/Unicode::Encoding/; +use Catalyst; __PACKAGE__->config( 'name' => 'TestAppUnicode', diff --git a/t/unicode_plugin_charset_utf8.t b/t/unicode_plugin_charset_utf8.t index 81ba9f7..4c7c0c6 100644 --- a/t/unicode_plugin_charset_utf8.t +++ b/t/unicode_plugin_charset_utf8.t @@ -6,7 +6,7 @@ use lib "$Bin/lib"; use Data::Dumper; BEGIN { - $ENV{TESTAPP_ENCODING} = 'UTF-8'; + # $ENV{TESTAPP_ENCODING} = 'UTF-8'; # This is now default $ENV{TESTAPP_DEBUG} = 0; $ENV{CATALYST_DEBUG} = 0; } @@ -27,6 +27,6 @@ is scalar(@TestLogger::LOGS), 1 or diag Dumper(\@TestLogger::LOGS); like $TestLogger::LOGS[0], qr/content type is 'iso-8859-1'/; -like $TestLogger::ELOGS[0], qr/Unicode::Encoding plugin/; +#like $TestLogger::ELOGS[0], qr/Unicode::Encoding plugin/; #no longer a plugin done_testing; diff --git a/t/utf_incoming.t b/t/utf_incoming.t index 516e9d5..f04dd19 100644 --- a/t/utf_incoming.t +++ b/t/utf_incoming.t @@ -111,12 +111,19 @@ use JSON::MaybeXS; my ($self, $c) = @_; Test::More::is $c->req->body_parameters->{'♥'}, '♥♥'; Test::More::ok my $upload = $c->req->uploads->{file}; + Test::More::is $upload->charset, 'UTF-8'; my $text = $upload->slurp; Test::More::is Encode::decode_utf8($text), "

This is stream_body_fh action ♥

\n"; + my $decoded_text = $upload->decoded_slurp; + Test::More::is $decoded_text, "

This is stream_body_fh action ♥

\n"; + + Test::More::is $upload->filename, '♥ttachment.txt'; + Test::More::is $upload->raw_basename, '♥ttachment.txt'; + $c->response->content_type('text/html'); - $c->response->body($upload->fh); + $c->response->body($decoded_text); } sub json :POST Consumes(JSON) Local { @@ -132,6 +139,28 @@ use JSON::MaybeXS; $c->response->body(JSON::MaybeXS::encode_json($post)); } + ## If someone clears encoding, they can do as they wish + sub manual_1 :Local { + my ($self, $c) = @_; + $c->encoding(undef); + $c->res->content_type('text/plain'); + $c->res->content_type_charset('UTF-8'); + $c->response->body( Encode::encode_utf8("manual_1 ♥")); + } + + ## If you do like gzip, well handle that yourself! Basically if you do some sort + ## of content encoding like gzip, you must do on top of the encoding. We will fix + ## the encoding plugins (Catalyst::Plugin::Compress) to do this properly for you. + # + sub gzipped :Local { + require Compress::Zlib; + my ($self, $c) = @_; + $c->res->content_type('text/plain'); + $c->res->content_type_charset('UTF-8'); + $c->res->content_encoding('gzip'); + $c->response->body(Compress::Zlib::memGzip(Encode::encode_utf8("manual_1 ♥"))); + } + package MyApp; use Catalyst; @@ -255,19 +284,19 @@ use Catalyst::Test 'MyApp'; { my $url = $c->uri_for($c->controller->action_for('heart_with_arg'), '♥'); - is "$url", 'http://localhost/root/a%E2%99%A5/%E2%99%A5'; + is "$url", 'http://localhost/root/a%E2%99%A5/%E2%99%A5', "correct $url"; } { my $url = $c->uri_for($c->controller->action_for('heart_with_arg'), ['♥']); - is "$url", 'http://localhost/root/a%E2%99%A5/%E2%99%A5'; + is "$url", 'http://localhost/root/a%E2%99%A5/%E2%99%A5', "correct $url"; } } { my $res = request "/root/stream_write"; - is $res->code, 200, 'OK'; + is $res->code, 200, 'OK GET /root/stream_write'; is decode_utf8($res->content), '

This is stream_write action ♥

', 'correct body'; is $res->content_charset, 'UTF-8'; } @@ -304,7 +333,7 @@ use Catalyst::Test 'MyApp'; ok my $path = File::Spec->catfile('t', 'utf8.txt'); ok my $req = POST '/root/file_upload', Content_Type => 'form-data', - Content => [encode_utf8('♥')=>encode_utf8('♥♥'), file=>["$path", 'attachment.txt', 'Content-Type' =>'text/html; charset=UTF-8', ]]; + Content => [encode_utf8('♥')=>encode_utf8('♥♥'), file=>["$path", encode_utf8('♥ttachment.txt'), 'Content-Type' =>'text/html; charset=UTF-8', ]]; ok my $res = request $req; is decode_utf8($res->content), "

This is stream_body_fh action ♥

\n"; @@ -321,6 +350,28 @@ use Catalyst::Test 'MyApp'; is_deeply decode_json(($res->content)), +{'♥'=>'♥♥'}; } +{ + my $res = request "/root/manual_1"; + + is $res->code, 200, 'OK'; + is decode_utf8($res->content), "manual_1 ♥", 'correct body'; + is $res->content_length, 12, 'correct length'; + is $res->content_charset, 'UTF-8'; +} + +SKIP: { + eval { require Compress::Zlib; 1} || do { + skip "Compress::Zlib needed to test gzip encoding", 5 }; + + my $res = request "/root/gzipped"; + ok my $raw_content = $res->content; + ok my $content = Compress::Zlib::memGunzip($raw_content), 'no gunzip error'; + + is $res->code, 200, 'OK'; + is decode_utf8($content), "manual_1 ♥", 'correct body'; + is $res->content_charset, 'UTF-8'; +} + ## should we use binmode on filehandles to force the encoding...? ## Not sure what else to do with multipart here, if docs are enough...