local-lib5/lib/perl5/WWW/Mechanize.pm

   1 package WWW::Mechanize;
   2
   3 =head1 NAME
   4
   5 WWW::Mechanize - Handy web browsing in a Perl object
   6
   7 =head1 VERSION
   8
   9 Version 1.60
  10
  11 =cut
  12
  13 our $VERSION = '1.60';
  14
  15 =head1 SYNOPSIS
  16
  17 C<WWW::Mechanize>, or Mech for short, helps you automate interaction with
  18 a website. It supports performing a sequence of page fetches including
  19 following links and submitting forms. Each fetched page is parsed and
  20 its links and forms are extracted. A link or a form can be selected, form
  21 fields can be filled and the next page can be fetched. Mech also stores
  22 a history of the URLs you've visited, which can be queried and revisited.
  23
  24     use WWW::Mechanize;
  25     my $mech = WWW::Mechanize->new();
  26
  27     $mech->get( $url );
  28
  29     $mech->follow_link( n => 3 );
  30     $mech->follow_link( text_regex => qr/download this/i );
  31     $mech->follow_link( url => 'http://host.com/index.html' );
  32
  33     $mech->submit_form(
  34         form_number => 3,
  35         fields      => {
  36             username    => 'mungo',
  37             password    => 'lost-and-alone',
  38         }
  39     );
  40
  41     $mech->submit_form(
  42         form_name => 'search',
  43         fields    => { query  => 'pot of gold', },
  44         button    => 'Search Now'
  45     );
  46
  47
  48 Mech is well suited for use in testing web applications.  If you use
  49 one of the Test::*, like L<Test::HTML::Lint> modules, you can check the
  50 fetched content and use that as input to a test call.
  51
  52     use Test::More;
  53     like( $mech->content(), qr/$expected/, "Got expected content" );
  54
  55 Each page fetch stores its URL in a history stack which you can
  56 traverse.
  57
  58     $mech->back();
  59
  60 If you want finer control over your page fetching, you can use
  61 these methods. C<follow_link> and C<submit_form> are just high
  62 level wrappers around them.
  63
  64     $mech->find_link( n => $number );
  65     $mech->form_number( $number );
  66     $mech->form_name( $name );
  67     $mech->field( $name, $value );
  68     $mech->set_fields( %field_values );
  69     $mech->set_visible( @criteria );
  70     $mech->click( $button );
  71
  72 L<WWW::Mechanize> is a proper subclass of L<LWP::UserAgent> and
  73 you can also use any of L<LWP::UserAgent>'s methods.
  74
  75     $mech->add_header($name => $value);
  76
  77 Please note that Mech does NOT support JavaScript.  Please check the
  78 FAQ in WWW::Mechanize::FAQ for more.
  79
  80 =head1 IMPORTANT LINKS
  81
  82 =over 4
  83
  84 =item * L<http://code.google.com/p/www-mechanize/issues/list>
  85
  86 The queue for bugs & enhancements in WWW::Mechanize and
  87 Test::WWW::Mechanize.  Please note that the queue at L<http://rt.cpan.org>
  88 is no longer maintained.
  89
  90 =item * L<http://search.cpan.org/dist/WWW-Mechanize/>
  91
  92 The CPAN documentation page for Mechanize.
  93
  94 =item * L<http://search.cpan.org/dist/WWW-Mechanize/lib/WWW/Mechanize/FAQ.pod>
  95
  96 Frequently asked questions.  Make sure you read here FIRST.
  97
  98 =back
  99
 100 =cut
 101
 102 use strict;
 103 use warnings;
 104
 105 use HTTP::Request 1.30;
 106 use LWP::UserAgent 5.827;
 107 use HTML::Form 1.00;
 108 use HTML::TokeParser;
 109
 110 use base 'LWP::UserAgent';
 111
 112 our $HAS_ZLIB;
 113 BEGIN {
 114     $HAS_ZLIB = eval 'use Compress::Zlib (); 1;';
 115 }
 116
 117 =head1 CONSTRUCTOR AND STARTUP
 118
 119 =head2 new()
 120
 121 Creates and returns a new WWW::Mechanize object, hereafter referred to as
 122 the "agent".
 123
 124     my $mech = WWW::Mechanize->new()
 125
 126 The constructor for WWW::Mechanize overrides two of the parms to the
 127 LWP::UserAgent constructor:
 128
 129     agent => 'WWW-Mechanize/#.##'
 130     cookie_jar => {}    # an empty, memory-only HTTP::Cookies object
 131
 132 You can override these overrides by passing parms to the constructor,
 133 as in:
 134
 135     my $mech = WWW::Mechanize->new( agent => 'wonderbot 1.01' );
 136
 137 If you want none of the overhead of a cookie jar, or don't want your
 138 bot accepting cookies, you have to explicitly disallow it, like so:
 139
 140     my $mech = WWW::Mechanize->new( cookie_jar => undef );
 141
 142 Here are the parms that WWW::Mechanize recognizes.  These do not include
 143 parms that L<LWP::UserAgent> recognizes.
 144
 145 =over 4
 146
 147 =item * C<< autocheck => [0|1] >>
 148
 149 Checks each request made to see if it was successful.  This saves
 150 you the trouble of manually checking yourself.  Any errors found
 151 are errors, not warnings.
 152
 153 The default value is ON, unless it's being subclassed, in which
 154 case it is OFF.  This means that standalone L<WWW::Mechanize>instances
 155 have autocheck turned on, which is protective for the vast majority
 156 of Mech users who don't bother checking the return value of get()
 157 and post() and can't figure why their code fails. However, if
 158 L<WWW::Mechanize> is subclassed, such as for L<Test::WWW::Mechanize>
 159 or L<Test::WWW::Mechanize::Catalyst>, this may not be an appropriate
 160 default, so it's off.
 161
 162 =item * C<< noproxy => [0|1] >>
 163
 164 Turn off the automatic call to the L<LWP::UserAgent> C<env_proxy> function.
 165
 166 This needs to be explicitly turned off if you're using L<Crypt::SSLeay> to
 167 access a https site via a proxy server.  Note: you still need to set your
 168 HTTPS_PROXY environment variable as appropriate.
 169
 170 =item * C<< onwarn => \&func >>
 171
 172 Reference to a C<warn>-compatible function, such as C<< L<Carp>::carp >>,
 173 that is called when a warning needs to be shown.
 174
 175 If this is set to C<undef>, no warnings will ever be shown.  However,
 176 it's probably better to use the C<quiet> method to control that behavior.
 177
 178 If this value is not passed, Mech uses C<Carp::carp> if L<Carp> is
 179 installed, or C<CORE::warn> if not.
 180
 181 =item * C<< onerror => \&func >>
 182
 183 Reference to a C<die>-compatible function, such as C<< L<Carp>::croak >>,
 184 that is called when there's a fatal error.
 185
 186 If this is set to C<undef>, no errors will ever be shown.
 187
 188 If this value is not passed, Mech uses C<Carp::croak> if L<Carp> is
 189 installed, or C<CORE::die> if not.
 190
 191 =item * C<< quiet => [0|1] >>
 192
 193 Don't complain on warnings.  Setting C<< quiet => 1 >> is the same as
 194 calling C<< $mech->quiet(1) >>.  Default is off.
 195
 196 =item * C<< stack_depth => $value >>
 197
 198 Sets the depth of the page stack that keeps track of all the
 199 downloaded pages. Default is effectively infinite stack size.  If
 200 the stack is eating up your memory, then set this to a smaller
 201 number, say 5 or 10.  Setting this to zero means Mech will keep no
 202 history.
 203
 204 =back
 205
 206 To support forms, WWW::Mechanize's constructor pushes POST
 207 on to the agent's C<requests_redirectable> list (see also
 208 L<LWP::UserAgent>.)
 209
 210 =cut
 211
 212 sub new {
 213     my $class = shift;
 214
 215     my %parent_parms = (
 216         agent       => "WWW-Mechanize/$VERSION",
 217         cookie_jar  => {},
 218     );
 219
 220     my %mech_parms = (
 221         autocheck   => ($class eq 'WWW::Mechanize' ? 1 : 0),
 222         onwarn      => \&WWW::Mechanize::_warn,
 223         onerror     => \&WWW::Mechanize::_die,
 224         quiet       => 0,
 225         stack_depth => 8675309,     # Arbitrarily humongous stack
 226         headers     => {},
 227         noproxy     => 0,
 228     );
 229
 230     my %passed_parms = @_;
 231
 232     # Keep the mech-specific parms before creating the object.
 233     while ( my($key,$value) = each %passed_parms ) {
 234         if ( exists $mech_parms{$key} ) {
 235             $mech_parms{$key} = $value;
 236         }
 237         else {
 238             $parent_parms{$key} = $value;
 239         }
 240     }
 241
 242     my $self = $class->SUPER::new( %parent_parms );
 243     bless $self, $class;
 244
 245     # Use the mech parms now that we have a mech object.
 246     for my $parm ( keys %mech_parms ) {
 247         $self->{$parm} = $mech_parms{$parm};
 248     }
 249     $self->{page_stack} = [];
 250     $self->env_proxy() unless $mech_parms{noproxy};
 251
 252     # libwww-perl 5.800 (and before, I assume) has a problem where
 253     # $ua->{proxy} can be undef and clone() doesn't handle it.
 254     $self->{proxy} = {} unless defined $self->{proxy};
 255     push( @{$self->requests_redirectable}, 'POST' );
 256
 257     $self->_reset_page;
 258
 259     return $self;
 260 }
 261
 262 =head2 $mech->agent_alias( $alias )
 263
 264 Sets the user agent string to the expanded version from a table of actual user strings.
 265 I<$alias> can be one of the following:
 266
 267 =over 4
 268
 269 =item * Windows IE 6
 270
 271 =item * Windows Mozilla
 272
 273 =item * Mac Safari
 274
 275 =item * Mac Mozilla
 276
 277 =item * Linux Mozilla
 278
 279 =item * Linux Konqueror
 280
 281 =back
 282
 283 then it will be replaced with a more interesting one.  For instance,
 284
 285     $mech->agent_alias( 'Windows IE 6' );
 286
 287 sets your User-Agent to
 288
 289     Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)
 290
 291 The list of valid aliases can be returned from C<known_agent_aliases()>.  The current list is:
 292
 293 =over
 294
 295 =item * Windows IE 6
 296
 297 =item * Windows Mozilla
 298
 299 =item * Mac Safari
 300
 301 =item * Mac Mozilla
 302
 303 =item * Linux Mozilla
 304
 305 =item * Linux Konqueror
 306
 307 =back
 308
 309 =cut
 310
 311 my %known_agents = (
 312     'Windows IE 6'      => 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
 313     'Windows Mozilla'   => 'Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.4b) Gecko/20030516 Mozilla Firebird/0.6',
 314     'Mac Safari'        => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X; en-us) AppleWebKit/85 (KHTML, like Gecko) Safari/85',
 315     'Mac Mozilla'       => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.4a) Gecko/20030401',
 316     'Linux Mozilla'     => 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.4) Gecko/20030624',
 317     'Linux Konqueror'   => 'Mozilla/5.0 (compatible; Konqueror/3; Linux)',
 318 );
 319
 320 sub agent_alias {
 321     my $self = shift;
 322     my $alias = shift;
 323
 324     if ( defined $known_agents{$alias} ) {
 325         return $self->agent( $known_agents{$alias} );
 326     }
 327     else {
 328         $self->warn( qq{Unknown agent alias "$alias"} );
 329         return $self->agent();
 330     }
 331 }
 332
 333 =head2 known_agent_aliases()
 334
 335 Returns a list of all the agent aliases that Mech knows about.
 336
 337 =cut
 338
 339 sub known_agent_aliases {
 340     return sort keys %known_agents;
 341 }
 342
 343 =head1 PAGE-FETCHING METHODS
 344
 345 =head2 $mech->get( $uri )
 346
 347 Given a URL/URI, fetches it.  Returns an L<HTTP::Response> object.
 348 I<$uri> can be a well-formed URL string, a L<URI> object, or a
 349 L<WWW::Mechanize::Link> object.
 350
 351 The results are stored internally in the agent object, but you don't
 352 know that.  Just use the accessors listed below.  Poking at the
 353 internals is deprecated and subject to change in the future.
 354
 355 C<get()> is a well-behaved overloaded version of the method in
 356 L<LWP::UserAgent>.  This lets you do things like
 357
 358     $mech->get( $uri, ':content_file' => $tempfile );
 359
 360 and you can rest assured that the parms will get filtered down
 361 appropriately.
 362
 363 B<NOTE:> Because C<:content_file> causes the page contents to be
 364 stored in a file instead of the response object, some Mech functions
 365 that expect it to be there won't work as expected. Use with caution.
 366
 367 =cut
 368
 369 sub get {
 370     my $self = shift;
 371     my $uri = shift;
 372
 373     $uri = $uri->url if ref($uri) eq 'WWW::Mechanize::Link';
 374
 375     $uri = $self->base
 376             ? URI->new_abs( $uri, $self->base )
 377             : URI->new( $uri );
 378
 379     # It appears we are returning a super-class method,
 380     # but it in turn calls the request() method here in Mechanize
 381     return $self->SUPER::get( $uri->as_string, @_ );
 382 }
 383
 384 =head2 $mech->put( $uri, content => $content )
 385
 386 PUTs I<$content> to $uri.  Returns an L<HTTP::Response> object.
 387 I<$uri> can be a well-formed URI string, a L<URI> object, or a
 388 L<WWW::Mechanize::Link> object.
 389
 390 =cut
 391
 392 sub put {
 393     my $self = shift;
 394     my $uri = shift;
 395
 396     $uri = $uri->url if ref($uri) eq 'WWW::Mechanize::Link';
 397
 398     $uri = $self->base
 399             ? URI->new_abs( $uri, $self->base )
 400             : URI->new( $uri );
 401
 402     # It appears we are returning a super-class method,
 403     # but it in turn calls the request() method here in Mechanize
 404     return $self->_SUPER_put( $uri->as_string, @_ );
 405 }
 406
 407
 408 # Added until LWP::UserAgent has it.
 409 sub _SUPER_put {
 410     require HTTP::Request::Common;
 411     my($self, @parameters) = @_;
 412     my @suff = $self->_process_colonic_headers(\@parameters,1);
 413     return $self->request( HTTP::Request::Common::PUT( @parameters ), @suff );
 414 }
 415
 416 =head2 $mech->reload()
 417
 418 Acts like the reload button in a browser: repeats the current
 419 request. The history (as per the L</back> method) is not altered.
 420
 421 Returns the L<HTTP::Response> object from the reload, or C<undef>
 422 if there's no current request.
 423
 424 =cut
 425
 426 sub reload {
 427     my $self = shift;
 428
 429     return unless my $req = $self->{req};
 430
 431     return $self->_update_page( $req, $self->_make_request( $req, @_ ) );
 432 }
 433
 434 =head2 $mech->back()
 435
 436 The equivalent of hitting the "back" button in a browser.  Returns to
 437 the previous page.  Won't go back past the first page. (Really, what
 438 would it do if it could?)
 439
 440 Returns true if it could go back, or false if not.
 441
 442 =cut
 443
 444 sub back {
 445     my $self = shift;
 446
 447     my $stack = $self->{page_stack};
 448     return unless $stack && @{$stack};
 449
 450     my $popped = pop @{$self->{page_stack}};
 451     my $req    = $popped->{req};
 452     my $res    = $popped->{res};
 453
 454     $self->_update_page( $req, $res );
 455
 456     return 1;
 457 }
 458
 459 =head1 STATUS METHODS
 460
 461 =head2 $mech->success()
 462
 463 Returns a boolean telling whether the last request was successful.
 464 If there hasn't been an operation yet, returns false.
 465
 466 This is a convenience function that wraps C<< $mech->res->is_success >>.
 467
 468 =cut
 469
 470 sub success {
 471     my $self = shift;
 472
 473     return $self->res && $self->res->is_success;
 474 }
 475
 476
 477 =head2 $mech->uri()
 478
 479 Returns the current URI as a L<URI> object. This object stringifies
 480 to the URI itself.
 481
 482 =head2 $mech->response() / $mech->res()
 483
 484 Return the current response as an L<HTTP::Response> object.
 485
 486 Synonym for C<< $mech->response() >>
 487
 488 =head2 $mech->status()
 489
 490 Returns the HTTP status code of the response.  This is a 3-digit
 491 number like 200 for OK, 404 for not found, and so on.
 492
 493 =head2 $mech->ct() / $mech->content_type()
 494
 495 Returns the content type of the response.
 496
 497 =head2 $mech->base()
 498
 499 Returns the base URI for the current response
 500
 501 =head2 $mech->forms()
 502
 503 When called in a list context, returns a list of the forms found in
 504 the last fetched page. In a scalar context, returns a reference to
 505 an array with those forms. The forms returned are all L<HTML::Form>
 506 objects.
 507
 508 =head2 $mech->current_form()
 509
 510 Returns the current form as an L<HTML::Form> object.
 511
 512 =head2 $mech->links()
 513
 514 When called in a list context, returns a list of the links found in the
 515 last fetched page.  In a scalar context it returns a reference to an array
 516 with those links.  Each link is a L<WWW::Mechanize::Link> object.
 517
 518 =head2 $mech->is_html()
 519
 520 Returns true/false on whether our content is HTML, according to the
 521 HTTP headers.
 522
 523 =cut
 524
 525 sub uri {
 526     my $self = shift;
 527     return $self->response->request->uri;
 528 }
 529
 530 sub res {           my $self = shift; return $self->{res}; }
 531 sub response {      my $self = shift; return $self->{res}; }
 532 sub status {        my $self = shift; return $self->{status}; }
 533 sub ct {            my $self = shift; return $self->{ct}; }
 534 sub content_type {  my $self = shift; return $self->{ct}; }
 535 sub base {          my $self = shift; return $self->{base}; }
 536 sub current_form {  my $self = shift; return $self->{form}; }
 537 sub is_html {       my $self = shift; return defined $self->ct && ($self->ct eq 'text/html'); }
 538
 539 =head2 $mech->title()
 540
 541 Returns the contents of the C<< <TITLE> >> tag, as parsed by
 542 L<HTML::HeadParser>.  Returns undef if the content is not HTML.
 543
 544 =cut
 545
 546 sub title {
 547     my $self = shift;
 548     return unless $self->is_html;
 549
 550     require HTML::HeadParser;
 551     my $p = HTML::HeadParser->new;
 552     $p->parse($self->content);
 553     return $p->header('Title');
 554 }
 555
 556 =head1 CONTENT-HANDLING METHODS
 557
 558 =head2 $mech->content(...)
 559
 560 Returns the content that the mech uses internally for the last page
 561 fetched. Ordinarily this is the same as $mech->response()->content(),
 562 but this may differ for HTML documents if L</update_html> is
 563 overloaded (in which case the value passed to the base-class
 564 implementation of same will be returned), and/or extra named arguments
 565 are passed to I<content()>:
 566
 567 =over 2
 568
 569 =item I<< $mech->content( format => 'text' ) >>
 570
 571 Returns a text-only version of the page, with all HTML markup
 572 stripped. This feature requires I<HTML::TreeBuilder> to be installed,
 573 or a fatal error will be thrown.
 574
 575 =item I<< $mech->content( base_href => [$base_href|undef] ) >>
 576
 577 Returns the HTML document, modified to contain a
 578 C<< <base href="$base_href"> >> mark-up in the header.
 579 I<$base_href> is C<< $mech->base() >> if not specified. This is
 580 handy to pass the HTML to e.g. L<HTML::Display>.
 581
 582 =back
 583
 584 Passing arguments to C<content()> if the current document is not
 585 HTML has no effect now (i.e. the return value is the same as
 586 C<< $self->response()->content() >>. This may change in the future,
 587 but will likely be backwards-compatible when it does.
 588
 589 =cut
 590
 591 sub content {
 592     my $self = shift;
 593     my $content = $self->{content};
 594
 595     if ( $self->is_html ) {
 596         my %parms = @_;
 597
 598         if ( exists $parms{base_href} ) {
 599             my $base_href = (delete $parms{base_href}) || $self->base;
 600             $content=~s/<head>/<head>\n<base href="$base_href">/i;
 601         }
 602
 603         if ( my $format = delete $parms{format} ) {
 604             $content = $self->_format_content( $format, $content );
 605         }
 606
 607         $self->_check_unhandled_parms( %parms );
 608     }
 609
 610     return $content;
 611 }
 612
 613 sub _format_content {
 614     my $self = shift;
 615     my $format = shift;
 616     my $content = shift;
 617
 618     if ( $format eq 'text' ) {
 619         return $self->_content_as_text($content);
 620     }
 621     else {
 622         $self->die( qq{Unknown "format" parameter "$format"} );
 623     }
 624 }
 625
 626 sub _content_as_text {
 627     my $self = shift;
 628     my $content = shift;
 629
 630     require HTML::TreeBuilder;
 631     my $tree = HTML::TreeBuilder->new();
 632     $tree->parse($content);
 633     $tree->eof();
 634     $tree->elementify(); # just for safety
 635     my $formatted_content = $tree->as_text();
 636     $tree->delete;
 637
 638     return $formatted_content;
 639 }
 640
 641 sub _check_unhandled_parms {
 642     my $self  = shift;
 643     my %parms = @_;
 644
 645     for my $cmd ( sort keys %parms ) {
 646         $self->die( qq{Unknown named argument "$cmd"} );
 647     }
 648 }
 649
 650 =head1 LINK METHODS
 651
 652 =head2 $mech->links()
 653
 654 Lists all the links on the current page.  Each link is a
 655 WWW::Mechanize::Link object. In list context, returns a list of all
 656 links.  In scalar context, returns an array reference of all links.
 657
 658 =cut
 659
 660 sub links {
 661     my $self = shift;
 662
 663     $self->_extract_links() unless $self->{_extracted_links};
 664
 665     return @{$self->{links}} if wantarray;
 666     return $self->{links};
 667 }
 668
 669 =head2 $mech->follow_link(...)
 670
 671 Follows a specified link on the page.  You specify the match to be
 672 found using the same parms that C<L<find_link()>> uses.
 673
 674 Here some examples:
 675
 676 =over 4
 677
 678 =item * 3rd link called "download"
 679
 680     $mech->follow_link( text => 'download', n => 3 );
 681
 682 =item * first link where the URL has "download" in it, regardless of case:
 683
 684     $mech->follow_link( url_regex => qr/download/i );
 685
 686 or
 687
 688     $mech->follow_link( url_regex => qr/(?i:download)/ );
 689
 690 =item * 3rd link on the page
 691
 692     $mech->follow_link( n => 3 );
 693
 694 =back
 695
 696 Returns the result of the GET method (an HTTP::Response object) if
 697 a link was found. If the page has no links, or the specified link
 698 couldn't be found, returns undef.
 699
 700 =cut
 701
 702 sub follow_link {
 703     my $self = shift;
 704     my %parms = ( n=>1, @_ );
 705
 706     if ( $parms{n} eq 'all' ) {
 707         delete $parms{n};
 708         $self->warn( q{follow_link(n=>"all") is not valid} );
 709     }
 710
 711     my $link = $self->find_link(%parms);
 712     return $self->get( $link->url ) if $link;
 713     $self->die( 'Link not found: ', $link->url ) if $self->{autocheck};
 714     return;
 715 }
 716
 717 =head2 $mech->find_link( ... )
 718
 719 Finds a link in the currently fetched page. It returns a
 720 L<WWW::Mechanize::Link> object which describes the link.  (You'll
 721 probably be most interested in the C<url()> property.)  If it fails
 722 to find a link it returns undef.
 723
 724 You can take the URL part and pass it to the C<get()> method.  If
 725 that's your plan, you might as well use the C<follow_link()> method
 726 directly, since it does the C<get()> for you automatically.
 727
 728 Note that C<< <FRAME SRC="..."> >> tags are parsed out of the the HTML
 729 and treated as links so this method works with them.
 730
 731 You can select which link to find by passing in one or more of these
 732 key/value pairs:
 733
 734 =over 4
 735
 736 =item * C<< text => 'string', >> and C<< text_regex => qr/regex/, >>
 737
 738 C<text> matches the text of the link against I<string>, which must be an
 739 exact match.  To select a link with text that is exactly "download", use
 740
 741     $mech->find_link( text => 'download' );
 742
 743 C<text_regex> matches the text of the link against I<regex>.  To select a
 744 link with text that has "download" anywhere in it, regardless of case, use
 745
 746     $mech->find_link( text_regex => qr/download/i );
 747
 748 Note that the text extracted from the page's links are trimmed.  For
 749 example, C<< <a> foo </a> >> is stored as 'foo', and searching for
 750 leading or trailing spaces will fail.
 751
 752 =item * C<< url => 'string', >> and C<< url_regex => qr/regex/, >>
 753
 754 Matches the URL of the link against I<string> or I<regex>, as appropriate.
 755 The URL may be a relative URL, like F<foo/bar.html>, depending on how
 756 it's coded on the page.
 757
 758 =item * C<< url_abs => string >> and C<< url_abs_regex => regex >>
 759
 760 Matches the absolute URL of the link against I<string> or I<regex>,
 761 as appropriate.  The URL will be an absolute URL, even if it's relative
 762 in the page.
 763
 764 =item * C<< name => string >> and C<< name_regex => regex >>
 765
 766 Matches the name of the link against I<string> or I<regex>, as appropriate.
 767
 768 =item * C<< id => string >> and C<< id_regex => regex >>
 769
 770 Matches the attribute 'id' of the link against I<string> or
 771 I<regex>, as appropriate.
 772
 773 =item * C<< class => string >> and C<< class_regex => regex >>
 774
 775 Matches the attribute 'class' of the link against I<string> or
 776 I<regex>, as appropriate.
 777
 778 =item * C<< tag => string >> and C<< tag_regex => regex >>
 779
 780 Matches the tag that the link came from against I<string> or I<regex>,
 781 as appropriate.  The C<tag_regex> is probably most useful to check for
 782 more than one tag, as in:
 783
 784     $mech->find_link( tag_regex => qr/^(a|frame)$/ );
 785
 786 The tags and attributes looked at are defined below, at
 787 L<< $mech->find_link() : link format >>.
 788
 789 =back
 790
 791 If C<n> is not specified, it defaults to 1.  Therefore, if you don't
 792 specify any parms, this method defaults to finding the first link on the
 793 page.
 794
 795 Note that you can specify multiple text or URL parameters, which
 796 will be ANDed together.  For example, to find the first link with
 797 text of "News" and with "cnn.com" in the URL, use:
 798
 799     $mech->find_link( text => 'News', url_regex => qr/cnn\.com/ );
 800
 801 The return value is a reference to an array containing a
 802 L<WWW::Mechanize::Link> object for every link in C<< $self->content >>.
 803
 804 The links come from the following:
 805
 806 =over 4
 807
 808 =item C<< <a href=...> >>
 809
 810 =item C<< <area href=...> >>
 811
 812 =item C<< <frame src=...> >>
 813
 814 =item C<< <iframe src=...> >>
 815
 816 =item C<< <link href=...> >>
 817
 818 =item C<< <meta content=...> >>
 819
 820 =back
 821
 822 =cut
 823
 824 sub find_link {
 825     my $self = shift;
 826     my %parms = ( n=>1, @_ );
 827
 828     my $wantall = ( $parms{n} eq 'all' );
 829
 830     $self->_clean_keys( \%parms, qr/^(n|(text|url|url_abs|name|tag|id|class)(_regex)?)$/ );
 831
 832     my @links = $self->links or return;
 833
 834     my $nmatches = 0;
 835     my @matches;
 836     for my $link ( @links ) {
 837         if ( _match_any_link_parms($link,\%parms) ) {
 838             if ( $wantall ) {
 839                 push( @matches, $link );
 840             }
 841             else {
 842                 ++$nmatches;
 843                 return $link if $nmatches >= $parms{n};
 844             }
 845         }
 846     } # for @links
 847
 848     if ( $wantall ) {
 849         return @matches if wantarray;
 850         return \@matches;
 851     }
 852
 853     return;
 854 } # find_link
 855
 856 # Used by find_links to check for matches
 857 # The logic is such that ALL parm criteria that are given must match
 858 sub _match_any_link_parms {
 859     my $link = shift;
 860     my $p = shift;
 861
 862     # No conditions, anything matches
 863     return 1 unless keys %$p;
 864
 865     return if defined $p->{url}           && !($link->url eq $p->{url} );
 866     return if defined $p->{url_regex}     && !($link->url =~ $p->{url_regex} );
 867     return if defined $p->{url_abs}       && !($link->url_abs eq $p->{url_abs} );
 868     return if defined $p->{url_abs_regex} && !($link->url_abs =~ $p->{url_abs_regex} );
 869     return if defined $p->{text}          && !(defined($link->text) && $link->text eq $p->{text} );
 870     return if defined $p->{text_regex}    && !(defined($link->text) && $link->text =~ $p->{text_regex} );
 871     return if defined $p->{name}          && !(defined($link->name) && $link->name eq $p->{name} );
 872     return if defined $p->{name_regex}    && !(defined($link->name) && $link->name =~ $p->{name_regex} );
 873     return if defined $p->{tag}           && !($link->tag && $link->tag eq $p->{tag} );
 874     return if defined $p->{tag_regex}     && !($link->tag && $link->tag =~ $p->{tag_regex} );
 875
 876     return if defined $p->{id}            && !($link->attrs->{id} && $link->attrs->{id} eq $p->{id} );
 877     return if defined $p->{id_regex}      && !($link->attrs->{id} && $link->attrs->{id} =~ $p->{id_regex} );
 878     return if defined $p->{class}         && !($link->attrs->{class} && $link->attrs->{class} eq $p->{class} );
 879     return if defined $p->{class_regex}   && !($link->attrs->{class} && $link->attrs->{class} =~ $p->{class_regex} );
 880
 881     # Success: everything that was defined passed.
 882     return 1;
 883
 884 }
 885
 886 # Cleans the %parms parameter for the find_link and find_image methods.
 887 sub _clean_keys {
 888     my $self = shift;
 889     my $parms = shift;
 890     my $rx_keyname = shift;
 891
 892     for my $key ( keys %$parms ) {
 893         my $val = $parms->{$key};
 894         if ( $key !~ qr/$rx_keyname/ ) {
 895             $self->warn( qq{Unknown link-finding parameter "$key"} );
 896             delete $parms->{$key};
 897             next;
 898         }
 899
 900         my $key_regex = ( $key =~ /_regex$/ );
 901         my $val_regex = ( ref($val) eq 'Regexp' );
 902
 903         if ( $key_regex ) {
 904             if ( !$val_regex ) {
 905                 $self->warn( qq{$val passed as $key is not a regex} );
 906                 delete $parms->{$key};
 907                 next;
 908             }
 909         }
 910         else {
 911             if ( $val_regex ) {
 912                 $self->warn( qq{$val passed as '$key' is a regex} );
 913                 delete $parms->{$key};
 914                 next;
 915             }
 916             if ( $val =~ /^\s|\s$/ ) {
 917                 $self->warn( qq{'$val' is space-padded and cannot succeed} );
 918                 delete $parms->{$key};
 919                 next;
 920             }
 921         }
 922     } # for keys %parms
 923 } # _clean_keys()
 924
 925
 926 =head2 $mech->find_all_links( ... )
 927
 928 Returns all the links on the current page that match the criteria.  The
 929 method for specifying link criteria is the same as in C<L</find_link()>>.
 930 Each of the links returned is a L<WWW::Mechanize::Link> object.
 931
 932 In list context, C<find_all_links()> returns a list of the links.
 933 Otherwise, it returns a reference to the list of links.
 934
 935 C<find_all_links()> with no parameters returns all links in the
 936 page.
 937
 938 =cut
 939
 940 sub find_all_links {
 941     my $self = shift;
 942     return $self->find_link( @_, n=>'all' );
 943 }
 944
 945 =head2 $mech->find_all_inputs( ... criteria ... )
 946
 947 find_all_inputs() returns an array of all the input controls in the
 948 current form whose properties match all of the regexes passed in.
 949 The controls returned are all descended from HTML::Form::Input.
 950
 951 If no criteria are passed, all inputs will be returned.
 952
 953 If there is no current page, there is no form on the current
 954 page, or there are no submit controls in the current form
 955 then the return will be an empty array.
 956
 957 You may use a regex or a literal string:
 958
 959     # get all textarea controls whose names begin with "customer"
 960     my @customer_text_inputs = $mech->find_all_inputs(
 961         type       => 'textarea',
 962         name_regex => qr/^customer/,
 963     );
 964
 965     # get all text or textarea controls called "customer"
 966     my @customer_text_inputs = $mech->find_all_inputs(
 967         type_regex => qr/^(text|textarea)$/,
 968         name       => 'customer',
 969     );
 970
 971 =cut
 972
 973 sub find_all_inputs {
 974     my $self = shift;
 975     my %criteria = @_;
 976
 977     my $form = $self->current_form() or return;
 978
 979     my @found;
 980     foreach my $input ( $form->inputs ) { # check every pattern for a match on the current hash
 981         my $matched = 1;
 982         foreach my $criterion ( sort keys %criteria ) { # Sort so we're deterministic
 983             my $field = $criterion;
 984             my $is_regex = ( $field =~ s/(?:_regex)$// );
 985             my $what = $input->{$field};
 986             $matched = defined($what) && (
 987                 $is_regex
 988                     ? ( $what =~ $criteria{$criterion} )
 989                     : ( $what eq $criteria{$criterion} )
 990                 );
 991             last if !$matched;
 992         }
 993         push @found, $input if $matched;
 994     }
 995     return @found;
 996 }
 997
 998 =head2 $mech->find_all_submits( ... criteria ... )
 999
1000 C<find_all_submits()> does the same thing as C<find_all_inputs()>
1001 except that it only returns controls that are submit controls,
1002 ignoring other types of input controls like text and checkboxes.
1003
1004 =cut
1005
1006 sub find_all_submits {
1007     my $self = shift;
1008
1009     return $self->find_all_inputs( @_, type_regex => qr/^(submit|image)$/ );
1010 }
1011
1012
1013 =head1 IMAGE METHODS
1014
1015 =head2 $mech->images
1016
1017 Lists all the images on the current page.  Each image is a
1018 WWW::Mechanize::Image object. In list context, returns a list of all
1019 images.  In scalar context, returns an array reference of all images.
1020
1021 =cut
1022
1023 sub images {
1024     my $self = shift;
1025
1026     $self->_extract_images() unless $self->{_extracted_images};
1027
1028     return @{$self->{images}} if wantarray;
1029     return $self->{images};
1030 }
1031
1032 =head2 $mech->find_image()
1033
1034 Finds an image in the current page. It returns a
1035 L<WWW::Mechanize::Image> object which describes the image.  If it fails
1036 to find an image it returns undef.
1037
1038 You can select which image to find by passing in one or more of these
1039 key/value pairs:
1040
1041 =over 4
1042
1043 =item * C<< alt => 'string' >> and C<< alt_regex => qr/regex/, >>
1044
1045 C<alt> matches the ALT attribute of the image against I<string>, which must be an
1046 exact match. To select a image with an ALT tag that is exactly "download", use
1047
1048     $mech->find_image( alt => 'download' );
1049
1050 C<alt_regex> matches the ALT attribute of the image  against a regular
1051 expression.  To select an image with an ALT attribute that has "download"
1052 anywhere in it, regardless of case, use
1053
1054     $mech->find_image( alt_regex => qr/download/i );
1055
1056 =item * C<< url => 'string', >> and C<< url_regex => qr/regex/, >>
1057
1058 Matches the URL of the image against I<string> or I<regex>, as appropriate.
1059 The URL may be a relative URL, like F<foo/bar.html>, depending on how
1060 it's coded on the page.
1061
1062 =item * C<< url_abs => string >> and C<< url_abs_regex => regex >>
1063
1064 Matches the absolute URL of the image against I<string> or I<regex>,
1065 as appropriate.  The URL will be an absolute URL, even if it's relative
1066 in the page.
1067
1068 =item * C<< tag => string >> and C<< tag_regex => regex >>
1069
1070 Matches the tag that the image came from against I<string> or I<regex>,
1071 as appropriate.  The C<tag_regex> is probably most useful to check for
1072 more than one tag, as in:
1073
1074     $mech->find_image( tag_regex => qr/^(img|input)$/ );
1075
1076 The tags supported are C<< <img> >> and C<< <input> >>.
1077
1078 =back
1079
1080 If C<n> is not specified, it defaults to 1.  Therefore, if you don't
1081 specify any parms, this method defaults to finding the first image on the
1082 page.
1083
1084 Note that you can specify multiple ALT or URL parameters, which
1085 will be ANDed together.  For example, to find the first image with
1086 ALT text of "News" and with "cnn.com" in the URL, use:
1087
1088     $mech->find_image( image => 'News', url_regex => qr/cnn\.com/ );
1089
1090 The return value is a reference to an array containing a
1091 L<WWW::Mechanize::Image> object for every image in C<< $self->content >>.
1092
1093 =cut
1094
1095 sub find_image {
1096     my $self = shift;
1097     my %parms = ( n=>1, @_ );
1098
1099     my $wantall = ( $parms{n} eq 'all' );
1100
1101     $self->_clean_keys( \%parms, qr/^(n|(alt|url|url_abs|tag)(_regex)?)$/ );
1102
1103     my @images = $self->images or return;
1104
1105     my $nmatches = 0;
1106     my @matches;
1107     for my $image ( @images ) {
1108         if ( _match_any_image_parms($image,\%parms) ) {
1109             if ( $wantall ) {
1110                 push( @matches, $image );
1111             }
1112             else {
1113                 ++$nmatches;
1114                 return $image if $nmatches >= $parms{n};
1115             }
1116         }
1117     } # for @images
1118
1119     if ( $wantall ) {
1120         return @matches if wantarray;
1121         return \@matches;
1122     }
1123
1124     return;
1125 }
1126
1127 # Used by find_images to check for matches
1128 # The logic is such that ALL parm criteria that are given must match
1129 sub _match_any_image_parms {
1130     my $image = shift;
1131     my $p = shift;
1132
1133     # No conditions, anything matches
1134     return 1 unless keys %$p;
1135
1136     return if defined $p->{url}           && !($image->url eq $p->{url} );
1137     return if defined $p->{url_regex}     && !($image->url =~ $p->{url_regex} );
1138     return if defined $p->{url_abs}       && !($image->url_abs eq $p->{url_abs} );
1139     return if defined $p->{url_abs_regex} && !($image->url_abs =~ $p->{url_abs_regex} );
1140     return if defined $p->{alt}           && !(defined($image->alt) && $image->alt eq $p->{alt} );
1141     return if defined $p->{alt_regex}     && !(defined($image->alt) && $image->alt =~ $p->{alt_regex} );
1142     return if defined $p->{tag}           && !($image->tag && $image->tag eq $p->{tag} );
1143     return if defined $p->{tag_regex}     && !($image->tag && $image->tag =~ $p->{tag_regex} );
1144
1145     # Success: everything that was defined passed.
1146     return 1;
1147 }
1148
1149
1150 =head2 $mech->find_all_images( ... )
1151
1152 Returns all the images on the current page that match the criteria.  The
1153 method for specifying image criteria is the same as in C<L</find_image()>>.
1154 Each of the images returned is a L<WWW::Mechanize::Image> object.
1155
1156 In list context, C<find_all_images()> returns a list of the images.
1157 Otherwise, it returns a reference to the list of images.
1158
1159 C<find_all_images()> with no parameters returns all images in the page.
1160
1161 =cut
1162
1163 sub find_all_images {
1164     my $self = shift;
1165     return $self->find_image( @_, n=>'all' );
1166 }
1167
1168 =head1 FORM METHODS
1169
1170 These methods let you work with the forms on a page.  The idea is
1171 to choose a form that you'll later work with using the field methods
1172 below.
1173
1174 =head2 $mech->forms
1175
1176 Lists all the forms on the current page.  Each form is an L<HTML::Form>
1177 object.  In list context, returns a list of all forms.  In scalar
1178 context, returns an array reference of all forms.
1179
1180 =cut
1181
1182 sub forms {
1183     my $self = shift;
1184     return @{$self->{forms}} if wantarray;
1185     return $self->{forms};
1186 }
1187
1188
1189 =head2 $mech->form_number($number)
1190
1191 Selects the I<number>th form on the page as the target for subsequent
1192 calls to C<L</field()>> and C<L</click()>>.  Also returns the form that was
1193 selected.
1194
1195 If it is found, the form is returned as an L<HTML::Form> object and set internally
1196 for later use with Mech's form methods such as C<L</field()>> and C<L</click()>>.
1197
1198 Emits a warning and returns undef if no form is found.
1199
1200 The first form is number 1, not zero.
1201
1202 =cut
1203
1204 sub form_number {
1205     my ($self, $form) = @_;
1206     # XXX Should we die if no $form is defined? Same question for form_name()
1207
1208     if ($self->{forms}->[$form-1]) {
1209         $self->{form} = $self->{forms}->[$form-1];
1210         return $self->{form};
1211     }
1212     else {
1213         $self->warn( "There is no form numbered $form" );
1214         return undef;
1215     }
1216 }
1217
1218 =head2 $mech->form_name( $name )
1219
1220 Selects a form by name.  If there is more than one form on the page
1221 with that name, then the first one is used, and a warning is
1222 generated.
1223
1224 If it is found, the form is returned as an L<HTML::Form> object and
1225 set internally for later use with Mech's form methods such as
1226 C<L</field()>> and C<L</click()>>.
1227
1228 Returns undef if no form is found.
1229
1230 =cut
1231
1232 sub form_name {
1233     my ($self, $form) = @_;
1234
1235     my $temp;
1236     my @matches = grep {defined($temp = $_->attr('name')) and ($temp eq $form) } $self->forms;
1237     if ( my $nmatches = @matches ) {
1238         $self->warn( "There are $nmatches forms named $form.  The first one was used." )
1239             if $nmatches > 1;
1240         return $self->{form} = $matches[0];
1241     }
1242     else {
1243         $self->warn( qq{ There is no form named "$form"} );
1244         return undef;
1245     }
1246 }
1247
1248 =head2 $mech->form_id( $name )
1249
1250 Selects a form by ID.  If there is more than one form on the page
1251 with that ID, then the first one is used, and a warning is generated.
1252
1253 If it is found, the form is returned as an L<HTML::Form> object and
1254 set internally for later use with Mech's form methods such as
1255 C<L</field()>> and C<L</click()>>.
1256
1257 Returns undef if no form is found.
1258
1259 =cut
1260
1261 sub form_id {
1262     my ($self, $formid) = @_;
1263
1264     my $temp;
1265     my @matches = grep { defined($temp = $_->attr('id')) and ($temp eq $formid) } $self->forms;
1266     if ( @matches ) {
1267         $self->warn( 'There are ', scalar @matches, " forms with ID $formid.  The first one was used." )
1268             if @matches > 1;
1269         return $self->{form} = $matches[0];
1270     }
1271     else {
1272         $self->warn( qq{ There is no form with ID "$formid"} );
1273         return undef;
1274     }
1275 }
1276
1277
1278 =head2 $mech->form_with_fields( @fields )
1279
1280 Selects a form by passing in a list of field names it must contain.  If there
1281 is more than one form on the page with that matches, then the first one is used,
1282 and a warning is generated.
1283
1284 If it is found, the form is returned as an L<HTML::Form> object and set internally
1285 for later used with Mech's form methods such as C<L</field()>> and C<L</click()>>.
1286
1287 Returns undef if no form is found.
1288
1289 Note that this functionality requires libwww-perl 5.69 or higher.
1290
1291 =cut
1292
1293 sub form_with_fields {
1294     my ($self, @fields) = @_;
1295     die 'no fields provided' unless scalar @fields;
1296
1297     my @matches;
1298     FORMS: for my $form (@{ $self->forms }) {
1299         my @fields_in_form = $form->param();
1300         for my $field (@fields) {
1301             next FORMS unless grep { $_ eq $field } @fields_in_form;
1302         }
1303         push @matches, $form;
1304     }
1305
1306     if ( my $nmatches = @matches ) {
1307         $self->warn( "There are $nmatches forms with the named fields.  The first one was used." )
1308             if $nmatches > 1;
1309         return $self->{form} = $matches[0];
1310     }
1311     else {
1312         $self->warn( qq{There is no form with the requested fields} );
1313         return undef;
1314     }
1315 }
1316
1317 =head1 FIELD METHODS
1318
1319 These methods allow you to set the values of fields in a given form.
1320
1321 =head2 $mech->field( $name, $value, $number )
1322
1323 =head2 $mech->field( $name, \@values, $number )
1324
1325 Given the name of a field, set its value to the value specified.
1326 This applies to the current form (as set by the L</form_name()> or
1327 L</form_number()> method or defaulting to the first form on the
1328 page).
1329
1330 The optional I<$number> parameter is used to distinguish between two fields
1331 with the same name.  The fields are numbered from 1.
1332
1333 =cut
1334
1335 sub field {
1336     my ($self, $name, $value, $number) = @_;
1337     $number ||= 1;
1338
1339     my $form = $self->{form};
1340     if ($number > 1) {
1341         $form->find_input($name, undef, $number)->value($value);
1342     }
1343     else {
1344         if ( ref($value) eq 'ARRAY' ) {
1345             $form->param($name, $value);
1346         }
1347         else {
1348             $form->value($name => $value);
1349         }
1350     }
1351 }
1352
1353 =head2 $mech->select($name, $value)
1354
1355 =head2 $mech->select($name, \@values)
1356
1357 Given the name of a C<select> field, set its value to the value
1358 specified.  If the field is not C<< <select multiple> >> and the
1359 C<$value> is an array, only the B<first> value will be set.  [Note:
1360 the documentation previously claimed that only the last value would
1361 be set, but this was incorrect.]  Passing C<$value> as a hash with
1362 an C<n> key selects an item by number (e.g.
1363 C<< {n => 3} >> or C<< {n => [2,4]} >>).
1364 The numbering starts at 1.  This applies to the current form.
1365
1366 Returns true on successfully setting the value. On failure, returns
1367 false and calls C<< $self>warn() >> with an error message.
1368
1369 =cut
1370
1371 sub select {
1372     my ($self, $name, $value) = @_;
1373
1374     my $form = $self->{form};
1375
1376     my $input = $form->find_input($name);
1377     if (!$input) {
1378         $self->warn( qq{Input "$name" not found} );
1379         return;
1380     }
1381
1382     if ($input->type ne 'option') {
1383         $self->warn( qq{Input "$name" is not type "select"} );
1384         return;
1385     }
1386
1387     # For $mech->select($name, {n => 3}) or $mech->select($name, {n => [2,4]}),
1388     # transform the 'n' number(s) into value(s) and put it in $value.
1389     if (ref($value) eq 'HASH') {
1390         for (keys %$value) {
1391             $self->warn(qq{Unknown select value parameter "$_"})
1392               unless $_ eq 'n';
1393         }
1394
1395         if (defined($value->{n})) {
1396             my @inputs = $form->find_input($name, 'option');
1397             my @values = ();
1398             # distinguish between multiple and non-multiple selects
1399             # (see INPUTS section of `perldoc HTML::Form`)
1400             if (@inputs == 1) {
1401                 @values = $inputs[0]->possible_values();
1402             }
1403             else {
1404                 foreach my $input (@inputs) {
1405                     my @possible = $input->possible_values();
1406                     push @values, pop @possible;
1407                 }
1408             }
1409
1410             my $n = $value->{n};
1411             if (ref($n) eq 'ARRAY') {
1412                 $value = [];
1413                 for (@$n) {
1414                     unless (/^\d+$/) {
1415                         $self->warn(qq{"n" value "$_" is not a positive integer});
1416                         return;
1417                     }
1418                     push @$value, $values[$_ - 1];  # might be undef
1419                 }
1420             }
1421             elsif (!ref($n) && $n =~ /^\d+$/) {
1422                 $value = $values[$n - 1];           # might be undef
1423             }
1424             else {
1425                 $self->warn('"n" value is not a positive integer or an array ref');
1426                 return;
1427             }
1428         }
1429         else {
1430             $self->warn('Hash value is invalid');
1431             return;
1432         }
1433     } # hashref
1434
1435     if (ref($value) eq 'ARRAY') {
1436         $form->param($name, $value);
1437         return 1;
1438     }
1439
1440     $form->value($name => $value);
1441     return 1;
1442 }
1443
1444 =head2 $mech->set_fields( $name => $value ... )
1445
1446 This method sets multiple fields of the current form. It takes a list
1447 of field name and value pairs. If there is more than one field with
1448 the same name, the first one found is set. If you want to select which
1449 of the duplicate field to set, use a value which is an anonymous array
1450 which has the field value and its number as the 2 elements.
1451
1452         # set the second foo field
1453         $mech->set_fields( $name => [ 'foo', 2 ] );
1454
1455 The fields are numbered from 1.
1456
1457 This applies to the current form.
1458
1459 =cut
1460
1461 sub set_fields {
1462     my $self = shift;
1463     my %fields = @_;
1464
1465     my $form = $self->current_form or $self->die( 'No form defined' );
1466
1467     while ( my ( $field, $value ) = each %fields ) {
1468         if ( ref $value eq 'ARRAY' ) {
1469             $form->find_input( $field, undef,
1470                          $value->[1])->value($value->[0] );
1471         }
1472         else {
1473             $form->value($field => $value);
1474         }
1475     } # while
1476 } # set_fields()
1477
1478 =head2 $mech->set_visible( @criteria )
1479
1480 This method sets fields of the current form without having to know
1481 their names.  So if you have a login screen that wants a username and
1482 password, you do not have to fetch the form and inspect the source (or
1483 use the F<mech-dump> utility, installed with WWW::Mechanize) to see
1484 what the field names are; you can just say
1485
1486     $mech->set_visible( $username, $password );
1487
1488 and the first and second fields will be set accordingly.  The method
1489 is called set_I<visible> because it acts only on visible fields;
1490 hidden form inputs are not considered.  The order of the fields is
1491 the order in which they appear in the HTML source which is nearly
1492 always the order anyone viewing the page would think they are in,
1493 but some creative work with tables could change that; caveat user.
1494
1495 Each element in C<@criteria> is either a field value or a field
1496 specifier.  A field value is a scalar.  A field specifier allows
1497 you to specify the I<type> of input field you want to set and is
1498 denoted with an arrayref containing two elements.  So you could
1499 specify the first radio button with
1500
1501     $mech->set_visible( [ radio => 'KCRW' ] );
1502
1503 Field values and specifiers can be intermixed, hence
1504
1505     $mech->set_visible( 'fred', 'secret', [ option => 'Checking' ] );
1506
1507 would set the first two fields to "fred" and "secret", and the I<next>
1508 C<OPTION> menu field to "Checking".
1509
1510 The possible field specifier types are: "text", "password", "hidden",
1511 "textarea", "file", "image", "submit", "radio", "checkbox" and "option".
1512
1513 C<set_visible> returns the number of values set.
1514
1515 =cut
1516
1517 sub set_visible {
1518     my $self = shift;
1519
1520     my $form = $self->current_form;
1521     my @inputs = $form->inputs;
1522
1523     my $num_set = 0;
1524     for my $value ( @_ ) {
1525         # Handle type/value pairs an arrayref
1526         if ( ref $value eq 'ARRAY' ) {
1527             my ( $type, $value ) = @$value;
1528             while ( my $input = shift @inputs ) {
1529                 next if $input->type eq 'hidden';
1530                 if ( $input->type eq $type ) {
1531                     $input->value( $value );
1532                     $num_set++;
1533                     last;
1534                 }
1535             } # while
1536         }
1537         # by default, it's a value
1538         else {
1539             while ( my $input = shift @inputs ) {
1540                 next if $input->type eq 'hidden';
1541                 $input->value( $value );
1542                 $num_set++;
1543                 last;
1544             } # while
1545         }
1546     } # for
1547
1548     return $num_set;
1549 } # set_visible()
1550
1551 =head2 $mech->tick( $name, $value [, $set] )
1552
1553 "Ticks" the first checkbox that has both the name and value associated
1554 with it on the current form.  Dies if there is no named check box for
1555 that value.  Passing in a false value as the third optional argument
1556 will cause the checkbox to be unticked.
1557
1558 =cut
1559
1560 sub tick {
1561     my $self = shift;
1562     my $name = shift;
1563     my $value = shift;
1564     my $set = @_ ? shift : 1;  # default to 1 if not passed
1565
1566     # loop though all the inputs
1567     my $index = 0;
1568     while ( my $input = $self->current_form->find_input( $name, 'checkbox', $index ) ) {
1569         # Can't guarantee that the first element will be undef and the second
1570         # element will be the right name
1571         foreach my $val ($input->possible_values()) {
1572             next unless defined $val;
1573             if ($val eq $value) {
1574                 $input->value($set ? $value : undef);
1575                 return;
1576             }
1577         }
1578
1579         # move onto the next input
1580         $index++;
1581     } # while
1582
1583     # got self far?  Didn't find anything
1584     $self->warn( qq{No checkbox "$name" for value "$value" in form} );
1585 } # tick()
1586
1587 =head2 $mech->untick($name, $value)
1588
1589 Causes the checkbox to be unticked.  Shorthand for
1590 C<tick($name,$value,undef)>
1591
1592 =cut
1593
1594 sub untick {
1595     shift->tick(shift,shift,undef);
1596 }
1597
1598 =head2 $mech->value( $name [, $number] )
1599
1600 Given the name of a field, return its value. This applies to the current
1601 form.
1602
1603 The optional I<$number> parameter is used to distinguish between two fields
1604 with the same name.  The fields are numbered from 1.
1605
1606 If the field is of type file (file upload field), the value is always
1607 cleared to prevent remote sites from downloading your local files.
1608 To upload a file, specify its file name explicitly.
1609
1610 =cut
1611
1612 sub value {
1613     my $self = shift;
1614     my $name = shift;
1615     my $number = shift || 1;
1616
1617     my $form = $self->{form};
1618     if ( $number > 1 ) {
1619         return $form->find_input( $name, undef, $number )->value();
1620     }
1621     else {
1622         return $form->value( $name );
1623     }
1624 } # value
1625
1626 =head2 $mech->click( $button [, $x, $y] )
1627
1628 Has the effect of clicking a button on the current form.  The first
1629 argument is the name of the button to be clicked.  The second and
1630 third arguments (optional) allow you to specify the (x,y) coordinates
1631 of the click.
1632
1633 If there is only one button on the form, C<< $mech->click() >> with
1634 no arguments simply clicks that one button.
1635
1636 Returns an L<HTTP::Response> object.
1637
1638 =cut
1639
1640 sub click {
1641     my ($self, $button, $x, $y) = @_;
1642     for ($x, $y) { $_ = 1 unless defined; }
1643     my $request = $self->{form}->click($button, $x, $y);
1644     return $self->request( $request );
1645 }
1646
1647 =head2 $mech->click_button( ... )
1648
1649 Has the effect of clicking a button on the current form by specifying
1650 its name, value, or index.  Its arguments are a list of key/value
1651 pairs.  Only one of name, number, input or value must be specified in
1652 the keys.
1653
1654 =over 4
1655
1656 =item * C<< name => name >>
1657
1658 Clicks the button named I<name> in the current form.
1659
1660 =item * C<< number => n >>
1661
1662 Clicks the I<n>th button in the current form. Numbering starts at 1.
1663
1664 =item * C<< value => value >>
1665
1666 Clicks the button with the value I<value> in the current form.
1667
1668 =item * C<< input => $inputobject >>
1669
1670 Clicks on the button referenced by $inputobject, an instance of
1671 L<HTML::Form::SubmitInput> obtained e.g. from
1672
1673     $mech->current_form()->find_input( undef, 'submit' )
1674
1675 $inputobject must belong to the current form.
1676
1677 =item * C<< x => x >>
1678
1679 =item * C<< y => y >>
1680
1681 These arguments (optional) allow you to specify the (x,y) coordinates
1682 of the click.
1683
1684 =back
1685
1686 =cut
1687
1688 sub click_button {
1689     my $self = shift;
1690     my %args = @_;
1691
1692     for ( keys %args ) {
1693         if ( !/^(number|name|value|input|x|y)$/ ) {
1694             $self->warn( qq{Unknown click_button parameter "$_"} );
1695         }
1696     }
1697
1698     for ($args{x}, $args{y}) {
1699         $_ = 1 unless defined;
1700     }
1701
1702     my $form = $self->{form};
1703     my $request;
1704     if ( $args{name} ) {
1705         $request = $form->click( $args{name}, $args{x}, $args{y} );
1706     }
1707     elsif ( $args{number} ) {
1708         my $input = $form->find_input( undef, 'submit', $args{number} );
1709         $request = $input->click( $form, $args{x}, $args{y} );
1710     }
1711     elsif ( $args{input} ) {
1712         $request = $args{input}->click( $form, $args{x}, $args{y} );
1713     }
1714     elsif ( $args{value} ) {
1715         my $i = 1;
1716         while ( my $input = $form->find_input(undef, 'submit', $i) ) {
1717             if ( $args{value} && ($args{value} eq $input->value) ) {
1718                 $request = $input->click( $form, $args{x}, $args{y} );
1719                 last;
1720             }
1721             $i++;
1722         } # while
1723     } # $args{value}
1724
1725     return $self->request( $request );
1726 }
1727
1728 =head2 $mech->submit()
1729
1730 Submits the page, without specifying a button to click.  Actually,
1731 no button is clicked at all.
1732
1733 Returns an L<HTTP::Response> object.
1734
1735 This used to be a synonym for C<< $mech->click( 'submit' ) >>, but is no
1736 longer so.
1737
1738 =cut
1739
1740 sub submit {
1741     my $self = shift;
1742
1743     my $request = $self->{form}->make_request;
1744     return $self->request( $request );
1745 }
1746
1747 =head2 $mech->submit_form( ... )
1748
1749 This method lets you select a form from the previously fetched page,
1750 fill in its fields, and submit it. It combines the form_number/form_name,
1751 set_fields and click methods into one higher level call. Its arguments
1752 are a list of key/value pairs, all of which are optional.
1753
1754 =over 4
1755
1756 =item * C<< fields => \%fields >>
1757
1758 Specifies the fields to be filled in the current form.
1759
1760 =item * C<< with_fields => \%fields >>
1761
1762 Probably all you need for the common case. It combines a smart form selector
1763 and data setting in one operation. It selects the first form that contains all
1764 fields mentioned in C<\%fields>.  This is nice because you don't need to know
1765 the name or number of the form to do this.
1766
1767 (calls C<L</form_with_fields()>> and C<L</set_fields()>>).
1768
1769 If you choose this, the form_number, form_name, form_id and fields options will be ignored.
1770
1771 =item * C<< form_number => n >>
1772
1773 Selects the I<n>th form (calls C<L</form_number()>>).  If this parm is not
1774 specified, the currently-selected form is used.
1775
1776 =item * C<< form_name => name >>
1777
1778 Selects the form named I<name> (calls C<L</form_name()>>)
1779
1780 =item * C<< form_id => ID >>
1781
1782 Selects the form with ID I<ID> (calls C<L</form_id()>>)
1783
1784 =item * C<< button => button >>
1785
1786 Clicks on button I<button> (calls C<L</click()>>)
1787
1788 =item * C<< x => x, y => y >>
1789
1790 Sets the x or y values for C<L</click()>>
1791
1792 =back
1793
1794 If no form is selected, the first form found is used.
1795
1796 If I<button> is not passed, then the C<L</submit()>> method is used instead.
1797
1798 Returns an L<HTTP::Response> object.
1799
1800 =cut
1801
1802 sub submit_form {
1803     my( $self, %args ) = @_;
1804
1805     for ( keys %args ) {
1806         if ( !/^(form_(number|name|fields|id)|(with_)?fields|button|x|y)$/ ) {
1807             # XXX Why not die here?
1808             $self->warn( qq{Unknown submit_form parameter "$_"} );
1809         }
1810     }
1811
1812     my $fields;
1813     for (qw/with_fields fields/) {
1814         if ($args{$_}) {
1815             if ( ref $args{$_} eq 'HASH' ) {
1816                 $fields = $args{$_};
1817             }
1818             else {
1819                 die "$_ arg to submit_form must be a hashref";
1820             }
1821             last;
1822         }
1823     }
1824
1825     if ( $args{with_fields} ) {
1826         $fields || die q{must submit some 'fields' with with_fields};
1827         $self->form_with_fields(keys %{$fields}) or die "There is no form with the requested fields";
1828     }
1829     elsif ( my $form_number = $args{form_number} ) {
1830         $self->form_number( $form_number ) or die "There is no form numbered $form_number";
1831     }
1832     elsif ( my $form_name = $args{form_name} ) {
1833         $self->form_name( $form_name ) or die qq{There is no form named "$form_name"};
1834     }
1835     elsif ( my $form_id = $args{form_id} ) {
1836         $self->form_id( $form_id ) or die qq{There is no form with ID "$form_id"};
1837     }
1838     else {
1839         # No form selector was used.
1840         # Maybe a form was set separately, or we'll default to the first form.
1841     }
1842
1843     $self->set_fields( %{$fields} ) if $fields;
1844
1845     my $response;
1846     if ( $args{button} ) {
1847         $response = $self->click( $args{button}, $args{x} || 0, $args{y} || 0 );
1848     }
1849     else {
1850         $response = $self->submit();
1851     }
1852
1853     return $response;
1854 }
1855
1856 =head1 MISCELLANEOUS METHODS
1857
1858 =head2 $mech->add_header( name => $value [, name => $value... ] )
1859
1860 Sets HTTP headers for the agent to add or remove from the HTTP request.
1861
1862     $mech->add_header( Encoding => 'text/klingon' );
1863
1864 If a I<value> is C<undef>, then that header will be removed from any
1865 future requests.  For example, to never send a Referer header:
1866
1867     $mech->add_header( Referer => undef );
1868
1869 If you want to delete a header, use C<delete_header>.
1870
1871 Returns the number of name/value pairs added.
1872
1873 B<NOTE>: This method was very different in WWW::Mechanize before 1.00.
1874 Back then, the headers were stored in a package hash, not as a member of
1875 the object instance.  Calling C<add_header()> would modify the headers
1876 for every WWW::Mechanize object, even after your object no longer existed.
1877
1878 =cut
1879
1880 sub add_header {
1881     my $self = shift;
1882     my $npairs = 0;
1883
1884     while ( @_ ) {
1885         my $key = shift;
1886         my $value = shift;
1887         ++$npairs;
1888
1889         $self->{headers}{$key} = $value;
1890     }
1891
1892     return $npairs;
1893 }
1894
1895 =head2 $mech->delete_header( name [, name ... ] )
1896
1897 Removes HTTP headers from the agent's list of special headers.  For
1898 instance, you might need to do something like:
1899
1900     # Don't send a Referer for this URL
1901     $mech->add_header( Referer => undef );
1902
1903     # Get the URL
1904     $mech->get( $url );
1905
1906     # Back to the default behavior
1907     $mech->delete_header( 'Referer' );
1908
1909 =cut
1910
1911 sub delete_header {
1912     my $self = shift;
1913
1914     while ( @_ ) {
1915         my $key = shift;
1916
1917         delete $self->{headers}{$key};
1918     }
1919
1920     return;
1921 }
1922
1923
1924 =head2 $mech->quiet(true/false)
1925
1926 Allows you to suppress warnings to the screen.
1927
1928     $mech->quiet(0); # turns on warnings (the default)
1929     $mech->quiet(1); # turns off warnings
1930     $mech->quiet();  # returns the current quietness status
1931
1932 =cut
1933
1934 sub quiet {
1935     my $self = shift;
1936
1937     $self->{quiet} = $_[0] if @_;
1938
1939     return $self->{quiet};
1940 }
1941
1942 =head2 $mech->stack_depth( $max_depth )
1943
1944 Get or set the page stack depth. Use this if you're doing a lot of page
1945 scraping and running out of memory.
1946
1947 A value of 0 means "no history at all."  By default, the max stack depth
1948 is humongously large, effectively keeping all history.
1949
1950 =cut
1951
1952 sub stack_depth {
1953     my $self = shift;
1954     $self->{stack_depth} = shift if @_;
1955     return $self->{stack_depth};
1956 }
1957
1958 =head2 $mech->save_content( $filename )
1959
1960 Dumps the contents of C<< $mech->content >> into I<$filename>.
1961 I<$filename> will be overwritten.  Dies if there are any errors.
1962
1963 If the content type does not begin with "text/", then the content
1964 is saved in binary mode.
1965
1966 =cut
1967
1968 sub save_content {
1969     my $self = shift;
1970     my $filename = shift;
1971
1972     open( my $fh, '>', $filename ) or $self->die( "Unable to create $filename: $!" );
1973     binmode $fh unless $self->content_type =~ m{^text/};
1974     print {$fh} $self->content or $self->die( "Unable to write to $filename: $!" );
1975     close $fh or $self->die( "Unable to close $filename: $!" );
1976
1977     return;
1978 }
1979
1980
1981 =head2 $mech->dump_headers( [$fh] )
1982
1983 Prints a dump of the HTTP response headers for the most recent
1984 response.  If I<$fh> is not specified or is undef, it dumps to
1985 STDOUT.
1986
1987 Unlike the rest of the dump_* methods, you cannot specify a filehandle
1988 to print to.
1989
1990 =cut
1991
1992 sub dump_headers {
1993     my $self = shift;
1994     my $fh   = shift || \*STDOUT;
1995
1996     print {$fh} $self->response->headers_as_string;
1997
1998     return;
1999 }
2000
2001
2002 =head2 $mech->dump_links( [[$fh], $absolute] )
2003
2004 Prints a dump of the links on the current page to I<$fh>.  If I<$fh>
2005 is not specified or is undef, it dumps to STDOUT.
2006
2007 If I<$absolute> is true, links displayed are absolute, not relative.
2008
2009 =cut
2010
2011 sub dump_links {
2012     my $self = shift;
2013     my $fh = shift || \*STDOUT;
2014     my $absolute = shift;
2015
2016     for my $link ( $self->links ) {
2017         my $url = $absolute ? $link->url_abs : $link->url;
2018         $url = '' if not defined $url;
2019         print {$fh} $url, "\n";
2020     }
2021     return;
2022 }
2023
2024 =head2 $mech->dump_images( [[$fh], $absolute] )
2025
2026 Prints a dump of the images on the current page to I<$fh>.  If I<$fh>
2027 is not specified or is undef, it dumps to STDOUT.
2028
2029 If I<$absolute> is true, links displayed are absolute, not relative.
2030
2031 =cut
2032
2033 sub dump_images {
2034     my $self = shift;
2035     my $fh = shift || \*STDOUT;
2036     my $absolute = shift;
2037
2038     for my $image ( $self->images ) {
2039         my $url = $absolute ? $image->url_abs : $image->url;
2040         $url = '' if not defined $url;
2041         print {$fh} $url, "\n";
2042     }
2043     return;
2044 }
2045
2046 =head2 $mech->dump_forms( [$fh] )
2047
2048 Prints a dump of the forms on the current page to I<$fh>.  If I<$fh>
2049 is not specified or is undef, it dumps to STDOUT.
2050
2051 =cut
2052
2053 sub dump_forms {
2054     my $self = shift;
2055     my $fh = shift || \*STDOUT;
2056
2057     for my $form ( $self->forms ) {
2058         print {$fh} $form->dump, "\n";
2059     }
2060     return;
2061 }
2062
2063 =head2 $mech->dump_all( [[$fh], $absolute] )
2064
2065 Prints a dump of all links, images and forms on the current page to
2066 I<$fh>.  If I<$fh> is not specified or is undef, it dumps to STDOUT.
2067
2068 If I<$absolute> is true, links displayed are absolute, not relative.
2069
2070 =cut
2071
2072 sub dump_all {
2073     my $self = shift;
2074     my $fh = shift || \*STDOUT;
2075     my $absolute = shift;
2076
2077     $self->dump_links( $fh, $absolute );
2078     $self->dump_images( $fh, $absolute );
2079     $self->dump_forms( $fh, $absolute );
2080
2081     return;
2082 }
2083
2084
2085 =head1 OVERRIDDEN LWP::UserAgent METHODS
2086
2087 =head2 $mech->clone()
2088
2089 Clone the mech object.  The clone will be using the same cookie jar
2090 as the original mech.
2091
2092 =cut
2093
2094 sub clone {
2095     my $self  = shift;
2096     my $clone = $self->SUPER::clone();
2097
2098     $clone->cookie_jar( $self->cookie_jar );
2099
2100     return $clone;
2101 }
2102
2103
2104 =head2 $mech->redirect_ok()
2105
2106 An overloaded version of C<redirect_ok()> in L<LWP::UserAgent>.
2107 This method is used to determine whether a redirection in the request
2108 should be followed.
2109
2110 Note that WWW::Mechanize's constructor pushes POST on to the agent's
2111 C<requests_redirectable> list.
2112
2113 =cut
2114
2115 sub redirect_ok {
2116     my $self = shift;
2117     my $prospective_request = shift;
2118     my $response = shift;
2119
2120     my $ok = $self->SUPER::redirect_ok( $prospective_request, $response );
2121     if ( $ok ) {
2122         $self->{redirected_uri} = $prospective_request->uri;
2123     }
2124
2125     return $ok;
2126 }
2127
2128
2129 =head2 $mech->request( $request [, $arg [, $size]])
2130
2131 Overloaded version of C<request()> in L<LWP::UserAgent>.  Performs
2132 the actual request.  Normally, if you're using WWW::Mechanize, it's
2133 because you don't want to deal with this level of stuff anyway.
2134
2135 Note that C<$request> will be modified.
2136
2137 Returns an L<HTTP::Response> object.
2138
2139 =cut
2140
2141 sub request {
2142     my $self = shift;
2143     my $request = shift;
2144
2145     $request = $self->_modify_request( $request );
2146
2147     if ( $request->method eq 'GET' || $request->method eq 'POST' ) {
2148         $self->_push_page_stack();
2149     }
2150
2151     $self->_update_page($request, $self->_make_request( $request, @_ ));
2152
2153     # XXX This should definitively return something.
2154 }
2155
2156 =head2 $mech->update_html( $html )
2157
2158 Allows you to replace the HTML that the mech has found.  Updates the
2159 forms and links parse-trees that the mech uses internally.
2160
2161 Say you have a page that you know has malformed output, and you want to
2162 update it so the links come out correctly:
2163
2164     my $html = $mech->content;
2165     $html =~ s[</option>.{0,3}</td>][</option></select></td>]isg;
2166     $mech->update_html( $html );
2167
2168 This method is also used internally by the mech itself to update its
2169 own HTML content when loading a page. This means that if you would
2170 like to I<systematically> perform the above HTML substitution, you
2171 would overload I<update_html> in a subclass thusly:
2172
2173    package MyMech;
2174    use base 'WWW::Mechanize';
2175
2176    sub update_html {
2177        my ($self, $html) = @_;
2178        $html =~ s[</option>.{0,3}</td>][</option></select></td>]isg;
2179        $self->WWW::Mechanize::update_html( $html );
2180    }
2181
2182 If you do this, then the mech will use the tidied-up HTML instead of
2183 the original both when parsing for its own needs, and for returning to
2184 you through L</content>.
2185
2186 Overloading this method is also the recommended way of implementing
2187 extra validation steps (e.g. link checkers) for every HTML page
2188 received.  L</warn> and L</die> would then come in handy to signal
2189 validation errors.
2190
2191 =cut
2192
2193 sub update_html {
2194     my $self = shift;
2195     my $html = shift;
2196
2197     $self->_reset_page;
2198     $self->{ct} = 'text/html';
2199     $self->{content} = $html;
2200
2201     $self->{forms} = [ HTML::Form->parse($html, $self->base) ];
2202     for my $form (@{ $self->{forms} }) {
2203         for my $input ($form->inputs) {
2204              if ($input->type eq 'file') {
2205                  $input->value( undef );
2206              }
2207         }
2208     }
2209     $self->{form}  = $self->{forms}->[0];
2210     $self->{_extracted_links} = 0;
2211     $self->{_extracted_images} = 0;
2212
2213     return;
2214 }
2215
2216 =head2 $mech->credentials( $username, $password )
2217
2218 Provide credentials to be used for HTTP Basic authentication for
2219 all sites and realms until further notice.
2220
2221 The four argument form described in L<LWP::UserAgent> is still
2222 supported.
2223
2224 =cut
2225
2226 sub credentials {
2227     my $self = shift;
2228
2229     # The lastest LWP::UserAgent also supports 2 arguments,
2230     # in which case the first is host:port
2231     if (@_ == 4 || (@_ == 2 && $_[0] =~ /:\d+$/)) {
2232         return $self->SUPER::credentials(@_);
2233     }
2234
2235     @_ == 2
2236         or $self->die( 'Invalid # of args for overridden credentials()' );
2237
2238     return @$self{qw( __username __password )} = @_;
2239 }
2240
2241 =head2 $mech->get_basic_credentials( $realm, $uri, $isproxy )
2242
2243 Returns the credentials for the realm and URI.
2244
2245 =cut
2246
2247 sub get_basic_credentials {
2248     my $self = shift;
2249     my @cred = grep { defined } @$self{qw( __username __password )};
2250     return @cred if @cred == 2;
2251     return $self->SUPER::get_basic_credentials(@_);
2252 }
2253
2254 =head2 $mech->clear_credentials()
2255
2256 Remove any credentials set up with C<credentials()>.
2257
2258 =cut
2259
2260 sub clear_credentials {
2261     my $self = shift;
2262     delete @$self{qw( __username __password )};
2263 }
2264
2265 =head1 INTERNAL-ONLY METHODS
2266
2267 These methods are only used internally.  You probably don't need to
2268 know about them.
2269
2270 =head2 $mech->_update_page($request, $response)
2271
2272 Updates all internal variables in $mech as if $request was just
2273 performed, and returns $response. The page stack is B<not> altered by
2274 this method, it is up to caller (e.g. L</request>) to do that.
2275
2276 =cut
2277
2278 sub _update_page {
2279     my ($self, $request, $res) = @_;
2280
2281     $self->{req} = $request;
2282     $self->{redirected_uri} = $request->uri->as_string;
2283
2284     $self->{res} = $res;
2285
2286     $self->{status}  = $res->code;
2287     $self->{base}    = $res->base;
2288     $self->{ct}      = $res->content_type || '';
2289
2290     if ( $res->is_success ) {
2291         $self->{uri} = $self->{redirected_uri};
2292         $self->{last_uri} = $self->{uri};
2293     }
2294
2295     if ( $res->is_error ) {
2296         if ( $self->{autocheck} ) {
2297             $self->die( 'Error ', $request->method, 'ing ', $request->uri, ': ', $res->message );
2298         }
2299     }
2300
2301     $self->_reset_page;
2302
2303     # Try to decode the content. Undef will be returned if there's nothing to decompress.
2304     # See docs in HTTP::Message for details. Do we need to expose the options there?
2305     my $content = $res->decoded_content();
2306     $content = $res->content if (not defined $content);
2307
2308     $content .= _taintedness();
2309
2310     if ($self->is_html) {
2311         $self->update_html($content);
2312     }
2313     else {
2314         $self->{content} = $content;
2315     }
2316
2317     return $res;
2318 } # _update_page
2319
2320 our $_taintbrush;
2321
2322 # This is lifted wholesale from Test::Taint
2323 sub _taintedness {
2324     return $_taintbrush if defined $_taintbrush;
2325
2326     # Somehow we need to get some taintedness into our $_taintbrush.
2327     # Let's try the easy way first. Either of these should be
2328     # tainted, unless somebody has untainted them, so this
2329     # will almost always work on the first try.
2330     # (Unless, of course, taint checking has been turned off!)
2331     $_taintbrush = substr("$0$^X", 0, 0);
2332     return $_taintbrush if _is_tainted( $_taintbrush );
2333
2334     # Let's try again. Maybe somebody cleaned those.
2335     $_taintbrush = substr(join("", @ARGV, %ENV), 0, 0);
2336     return $_taintbrush if _is_tainted( $_taintbrush );
2337
2338     # If those don't work, go try to open some file from some unsafe
2339     # source and get data from them.  That data is tainted.
2340     # (Yes, even reading from /dev/null works!)
2341     for my $filename ( qw(/dev/null / . ..), values %INC, $0, $^X ) {
2342         if ( open my $fh, '<', $filename ) {
2343             my $data;
2344             if ( defined sysread $fh, $data, 1 ) {
2345                 $_taintbrush = substr( $data, 0, 0 );
2346                 last if _is_tainted( $_taintbrush );
2347             }
2348         }
2349     }
2350
2351     # Sanity check
2352     die "Our taintbrush should have zero length!" if length $_taintbrush;
2353
2354     return $_taintbrush;
2355 }
2356
2357 sub _is_tainted {
2358     no warnings qw(void uninitialized);
2359
2360     return !eval { join('', shift), kill 0; 1 };
2361 } # _is_tainted
2362
2363
2364 =head2 $mech->_modify_request( $req )
2365
2366 Modifies a L<HTTP::Request> before the request is sent out,
2367 for both GET and POST requests.
2368
2369 We add a C<Referer> header, as well as header to note that we can accept gzip
2370 encoded content, if L<Compress::Zlib> is installed.
2371
2372 =cut
2373
2374 sub _modify_request {
2375     my $self = shift;
2376     my $req = shift;
2377
2378     # add correct Accept-Encoding header to restore compliance with
2379     # http://www.freesoft.org/CIE/RFC/2068/158.htm
2380     # http://use.perl.org/~rhesa/journal/25952
2381     if (not $req->header( 'Accept-Encoding' ) ) {
2382         # "identity" means "please! unencoded content only!"
2383         $req->header( 'Accept-Encoding', $HAS_ZLIB ? 'gzip' : 'identity' );
2384     }
2385
2386     my $last = $self->{last_uri};
2387     if ( $last ) {
2388         $last = $last->as_string if ref($last);
2389         $req->header( Referer => $last );
2390     }
2391     while ( my($key,$value) = each %{$self->{headers}} ) {
2392         if ( defined $value ) {
2393             $req->header( $key => $value );
2394         }
2395         else {
2396             $req->remove_header( $key );
2397         }
2398     }
2399
2400     return $req;
2401 }
2402
2403
2404 =head2 $mech->_make_request()
2405
2406 Convenience method to make it easier for subclasses like
2407 L<WWW::Mechanize::Cached> to intercept the request.
2408
2409 =cut
2410
2411 sub _make_request {
2412     my $self = shift;
2413     return $self->SUPER::request(@_);
2414 }
2415
2416 =head2 $mech->_reset_page()
2417
2418 Resets the internal fields that track page parsed stuff.
2419
2420 =cut
2421
2422 sub _reset_page {
2423     my $self = shift;
2424
2425     $self->{_extracted_links}  = 0;
2426     $self->{_extracted_images} = 0;
2427     $self->{links}             = [];
2428     $self->{images}            = [];
2429     $self->{forms}             = [];
2430
2431     delete $self->{form};
2432
2433     return;
2434 }
2435
2436 =head2 $mech->_extract_links()
2437
2438 Extracts links from the content of a webpage, and populates the C<{links}>
2439 property with L<WWW::Mechanize::Link> objects.
2440
2441 =cut
2442
2443 my %link_tags = (
2444     a      => 'href',
2445     area   => 'href',
2446     frame  => 'src',
2447     iframe => 'src',
2448     link   => 'href',
2449     meta   => 'content',
2450 );
2451
2452 sub _extract_links {
2453     my $self = shift;
2454
2455
2456     $self->{links} = [];
2457     if ( defined $self->{content} ) {
2458         my $parser = HTML::TokeParser->new(\$self->{content});
2459         while ( my $token = $parser->get_tag( keys %link_tags ) ) {
2460             my $link = $self->_link_from_token( $token, $parser );
2461             push( @{$self->{links}}, $link ) if $link;
2462         } # while
2463     }
2464
2465     $self->{_extracted_links} = 1;
2466
2467     return;
2468 }
2469
2470
2471 my %image_tags = (
2472     img   => 'src',
2473     input => 'src',
2474 );
2475
2476 sub _extract_images {
2477     my $self = shift;
2478
2479     $self->{images} = [];
2480
2481     if ( defined $self->{content} ) {
2482         my $parser = HTML::TokeParser->new(\$self->{content});
2483         while ( my $token = $parser->get_tag( keys %image_tags ) ) {
2484             my $image = $self->_image_from_token( $token, $parser );
2485             push( @{$self->{images}}, $image ) if $image;
2486         } # while
2487     }
2488
2489     $self->{_extracted_images} = 1;
2490
2491     return;
2492 }
2493
2494 sub _image_from_token {
2495     my $self = shift;
2496     my $token = shift;
2497     my $parser = shift;
2498
2499     my $tag = $token->[0];
2500     my $attrs = $token->[1];
2501
2502     if ( $tag eq 'input' ) {
2503         my $type = $attrs->{type} or return;
2504         return unless $type eq 'image';
2505     }
2506
2507     require WWW::Mechanize::Image;
2508     return
2509         WWW::Mechanize::Image->new({
2510             tag     => $tag,
2511             base    => $self->base,
2512             url     => $attrs->{src},
2513             name    => $attrs->{name},
2514             height  => $attrs->{height},
2515             width   => $attrs->{width},
2516             alt     => $attrs->{alt},
2517         });
2518 }
2519
2520 sub _link_from_token {
2521     my $self = shift;
2522     my $token = shift;
2523     my $parser = shift;
2524
2525     my $tag = $token->[0];
2526     my $attrs = $token->[1];
2527     my $url = $attrs->{$link_tags{$tag}};
2528
2529     my $text;
2530     my $name;
2531     if ( $tag eq 'a' ) {
2532         $text = $parser->get_trimmed_text("/$tag");
2533         $text = '' unless defined $text;
2534
2535         my $onClick = $attrs->{onclick};
2536         if ( $onClick && ($onClick =~ /^window\.open\(\s*'([^']+)'/) ) {
2537             $url = $1;
2538         }
2539     } # a
2540
2541     # Of the tags we extract from, only 'AREA' has an alt tag
2542     # The rest should have a 'name' attribute.
2543     # ... but we don't do anything with that bit of wisdom now.
2544
2545     $name = $attrs->{name};
2546
2547     if ( $tag eq 'meta' ) {
2548         my $equiv = $attrs->{'http-equiv'};
2549         my $content = $attrs->{'content'};
2550         return unless $equiv && (lc $equiv eq 'refresh') && defined $content;
2551
2552         if ( $content =~ /^\d+\s*;\s*url\s*=\s*(\S+)/i ) {
2553             $url = $1;
2554             $url =~ s/^"(.+)"$/$1/ or $url =~ s/^'(.+)'$/$1/;
2555         }
2556         else {
2557             undef $url;
2558         }
2559     } # meta
2560
2561     return unless defined $url;   # probably just a name link or <AREA NOHREF...>
2562
2563     require WWW::Mechanize::Link;
2564     return
2565         WWW::Mechanize::Link->new({
2566             url  => $url,
2567             text => $text,
2568             name => $name,
2569             tag  => $tag,
2570             base => $self->base,
2571             attrs => $attrs,
2572         });
2573 } # _link_from_token
2574
2575 =head2 $mech->_push_page_stack()
2576
2577 The agent keeps a stack of visited pages, which it can pop when it needs
2578 to go BACK and so on.
2579
2580 The current page needs to be pushed onto the stack before we get a new
2581 page, and the stack needs to be popped when BACK occurs.
2582
2583 Neither of these take any arguments, they just operate on the $mech
2584 object.
2585
2586 =cut
2587
2588 sub _push_page_stack {
2589     my $self = shift;
2590
2591     my $req = $self->{req};
2592     my $res = $self->{res};
2593
2594     return unless $req && $res && $self->stack_depth;
2595
2596     # Don't push anything if it's a virgin object
2597     my $stack = $self->{page_stack} ||= [];
2598     if ( @{$stack} >= $self->stack_depth ) {
2599         shift @{$stack};
2600     }
2601     push( @{$stack}, { req => $req, res => $res } );
2602
2603     return 1;
2604 }
2605
2606 =head2 warn( @messages )
2607
2608 Centralized warning method, for diagnostics and non-fatal problems.
2609 Defaults to calling C<CORE::warn>, but may be overridden by setting
2610 C<onwarn> in the constructor.
2611
2612 =cut
2613
2614 sub warn {
2615     my $self = shift;
2616
2617     return unless my $handler = $self->{onwarn};
2618
2619     return if $self->quiet;
2620
2621     return $handler->(@_);
2622 }
2623
2624 =head2 die( @messages )
2625
2626 Centralized error method.  Defaults to calling C<CORE::die>, but
2627 may be overridden by setting C<onerror> in the constructor.
2628
2629 =cut
2630
2631 sub die {
2632     my $self = shift;
2633
2634     return unless my $handler = $self->{onerror};
2635
2636     return $handler->(@_);
2637 }
2638
2639
2640 # NOT an object method!
2641 sub _warn {
2642     require Carp;
2643     return &Carp::carp; ## no critic
2644 }
2645
2646 # NOT an object method!
2647 sub _die {
2648     require Carp;
2649     return &Carp::croak; ## no critic
2650 }
2651
2652 1; # End of module
2653
2654 __END__
2655
2656 =head1 REQUESTS & BUGS
2657
2658 The bug queue for WWW::Mechanize and Test::WWW::Mechanize is at
2659 L<http://code.google.com/p/www-mechanize/issues/list>.  Please do
2660 not add any tickets to the old queue at L<http://rt.cpan.org/>.
2661
2662 =head1 WWW::MECHANIZE'S SUBVERSION REPOSITORY
2663
2664 Mech and Test::WWW::Mechanize are both hosted at Google Code:
2665 http://code.google.com/p/www-mechanize/.  The Subversion repository
2666 is at http://www-mechanize.googlecode.com/svn/wm/.
2667
2668 =head1 OTHER DOCUMENTATION
2669
2670 =head2 I<Spidering Hacks>, by Kevin Hemenway and Tara Calishain
2671
2672 I<Spidering Hacks> from O'Reilly
2673 (L<http://www.oreilly.com/catalog/spiderhks/>) is a great book for anyone
2674 wanting to know more about screen-scraping and spidering.
2675
2676 There are six hacks that use Mech or a Mech derivative:
2677
2678 =over 4
2679
2680 =item #21 WWW::Mechanize 101
2681
2682 =item #22 Scraping with WWW::Mechanize
2683
2684 =item #36 Downloading Images from Webshots
2685
2686 =item #44 Archiving Yahoo! Groups Messages with WWW::Yahoo::Groups
2687
2688 =item #64 Super Author Searching
2689
2690 =item #73 Scraping TV Listings
2691
2692 =back
2693
2694 The book was also positively reviewed on Slashdot:
2695 L<http://books.slashdot.org/article.pl?sid=03/12/11/2126256>
2696
2697 =head1 ONLINE RESOURCES AND SUPPORT
2698
2699 =over 4
2700
2701 =item * WWW::Mechanize mailing list
2702
2703 The Mech mailing list is at
2704 L<http://groups.google.com/group/www-mechanize-users> and is specific
2705 to Mechanize, unlike the LWP mailing list below.  Although it is a
2706 users list, all development discussion takes place here, too.
2707
2708 =item * LWP mailing list
2709
2710 The LWP mailing list is at
2711 L<http://lists.perl.org/showlist.cgi?name=libwww>, and is more
2712 user-oriented and well-populated than the WWW::Mechanize list.
2713
2714 =item * Perlmonks
2715
2716 L<http://perlmonks.org> is an excellent community of support, and
2717 many questions about Mech have already been answered there.
2718
2719 =item * L<WWW::Mechanize::Examples>
2720
2721 A random array of examples submitted by users, included with the
2722 Mechanize distribution.
2723
2724 =back
2725
2726 =head1 ARTICLES ABOUT WWW::MECHANIZE
2727
2728 =over 4
2729
2730 =item * L<http://www-128.ibm.com/developerworks/linux/library/wa-perlsecure.html>
2731
2732 IBM article "Secure Web site access with Perl"
2733
2734 =item * L<http://www.oreilly.com/catalog/googlehks2/chapter/hack84.pdf>
2735
2736 Leland Johnson's hack #84 in I<Google Hacks, 2nd Edition> is
2737 an example of a production script that uses WWW::Mechanize and
2738 HTML::TableContentParser. It takes in keywords and returns the estimated
2739 price of these keywords on Google's AdWords program.
2740
2741 =item * L<http://www.perl.com/pub/a/2004/06/04/recorder.html>
2742
2743 Linda Julien writes about using HTTP::Recorder to create WWW::Mechanize
2744 scripts.
2745
2746 =item * L<http://www.developer.com/lang/other/article.php/3454041>
2747
2748 Jason Gilmore's article on using WWW::Mechanize for scraping sales
2749 information from Amazon and eBay.
2750
2751 =item * L<http://www.perl.com/pub/a/2003/01/22/mechanize.html>
2752
2753 Chris Ball's article about using WWW::Mechanize for scraping TV
2754 listings.
2755
2756 =item * L<http://www.stonehenge.com/merlyn/LinuxMag/col47.html>
2757
2758 Randal Schwartz's article on scraping Yahoo News for images.  It's
2759 already out of date: He manually walks the list of links hunting
2760 for matches, which wouldn't have been necessary if the C<find_link()>
2761 method existed at press time.
2762
2763 =item * L<http://www.perladvent.org/2002/16th/>
2764
2765 WWW::Mechanize on the Perl Advent Calendar, by Mark Fowler.
2766
2767 =item * L<http://www.linux-magazin.de/Artikel/ausgabe/2004/03/perl/perl.html>
2768
2769 Michael Schilli's article on Mech and L<WWW::Mechanize::Shell> for the
2770 German magazine I<Linux Magazin>.
2771
2772 =back
2773
2774 =head2 Other modules that use Mechanize
2775
2776 Here are modules that use or subclass Mechanize.  Let me know of any others:
2777
2778 =over 4
2779
2780 =item * L<Finance::Bank::LloydsTSB>
2781
2782 =item * L<HTTP::Recorder>
2783
2784 Acts as a proxy for web interaction, and then generates WWW::Mechanize scripts.
2785
2786 =item * L<Win32::IE::Mechanize>
2787
2788 Just like Mech, but using Microsoft Internet Explorer to do the work.
2789
2790 =item * L<WWW::Bugzilla>
2791
2792 =item * L<WWW::CheckSite>
2793
2794 =item * L<WWW::Google::Groups>
2795
2796 =item * L<WWW::Hotmail>
2797
2798 =item * L<WWW::Mechanize::Cached>
2799
2800 =item * L<WWW::Mechanize::FormFiller>
2801
2802 =item * L<WWW::Mechanize::Shell>
2803
2804 =item * L<WWW::Mechanize::Sleepy>
2805
2806 =item * L<WWW::Mechanize::SpamCop>
2807
2808 =item * L<WWW::Mechanize::Timed>
2809
2810 =item * L<WWW::SourceForge>
2811
2812 =item * L<WWW::Yahoo::Groups>
2813
2814 =back
2815
2816 =head1 ACKNOWLEDGEMENTS
2817
2818 Thanks to the numerous people who have helped out on WWW::Mechanize in
2819 one way or another, including
2820 Kirrily Robert for the original C<WWW::Automate>,
2821 Gisle Aas,
2822 Jeremy Ary,
2823 Hilary Holz,
2824 Rafael Kitover,
2825 Norbert Buchmuller,
2826 Dave Page,
2827 David Sainty,
2828 H.Merijn Brand,
2829 Matt Lawrence,
2830 Michael Schwern,
2831 Adriano Ferreira,
2832 Miyagawa,
2833 Peteris Krumins,
2834 Rafael Kitover,
2835 David Steinbrunner,
2836 Kevin Falcone,
2837 Mike O'Regan,
2838 Mark Stosberg,
2839 Uri Guttman,
2840 Peter Scott,
2841 Phillipe Bruhat,
2842 Ian Langworth,
2843 John Beppu,
2844 Gavin Estey,
2845 Jim Brandt,
2846 Ask Bjoern Hansen,
2847 Greg Davies,
2848 Ed Silva,
2849 Mark-Jason Dominus,
2850 Autrijus Tang,
2851 Mark Fowler,
2852 Stuart Children,
2853 Max Maischein,
2854 Meng Wong,
2855 Prakash Kailasa,
2856 Abigail,
2857 Jan Pazdziora,
2858 Dominique Quatravaux,
2859 Scott Lanning,
2860 Rob Casey,
2861 Leland Johnson,
2862 Joshua Gatcomb,
2863 Julien Beasley,
2864 Abe Timmerman,
2865 Peter Stevens,
2866 Pete Krawczyk,
2867 Tad McClellan,
2868 and the late great Iain Truskett.
2869
2870 =head1 COPYRIGHT
2871
2872 Copyright (c) 2005-2009 Andy Lester. All rights reserved. This program is
2873 free software; you can redistribute it and/or modify it under the same
2874 terms as Perl itself.
2875
2876 =cut