# libwhisker v1.7 # libwhisker is a collection of routines used by whisker # # libwhisker copyright 2000,2001,2002 rfp.labs # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # # More information can be found at http://www.wiretrip.net/rfp/ # Libwhisker mailing list and resources are also available at # http://sourceforge.net/projects/whisker/ # package LW; use 5.004; $LW::VERSION="1.7"; ####### external module tests ################################### BEGIN { ## LW module manager stuff ## %LW::available = (); $LW::LW_HAS_SOCKET = 0; $LW::LW_HAS_SSL = 0; $LW::LW_SSL_LIB = 0; $LW::LW_NONBLOCK_CONNECT= 0; ## binary helper - may contain functions substituted further down ## eval "use LW::bin"; # do we have libwhisker binary helpers? if($@){ $LW::available{'LW::bin'}=$LW::bin::VERSION; } ## encode subpackage ## eval "require MIME::Base64"; if($@){ *encode_base64 = \&encode_base64_perl; *decode_base64 = \&decode_base64_perl; } else{ # MIME::Base64 typically has faster C versions $LW::available{'mime::base64'}=$MIME::Base64::VERSION; *encode_base64 = \&MIME::Base64::encode_base64; *decode_base64 = \&MIME::Base64::decode_base64;} ## md5 subpackage ## eval "require MD5"; if(!$@){ $LW::available{'md5'}=$MD5::VERSION;} ## http subpackage ## eval "use Socket"; # do we have socket support? if($@){ $LW::LW_HAS_SOCKET=0; } else { $LW::LW_HAS_SOCKET=1; $LW::available{'socket'}=$Socket::VERSION;} if($LW_HAS_SOCKET){ eval "use Net::SSLeay"; # do we have SSL support? if($@){ $LW::LW_HAS_SSL=0; } else { $LW::LW_HAS_SSL=1; $LW::LW_SSL_LIB=1; $LW::available{'net::ssleay'}=$Net::SSLeay::VERSION; Net::SSLeay::load_error_strings(); Net::SSLeay::SSLeay_add_ssl_algorithms(); Net::SSLeay::randomize();} if(!$LW::LW_HAS_SSL){ eval "use Net::SSL"; # different SSL lib if($@){ $LW::LW_HAS_SSL=0; } else { $LW::LW_HAS_SSL=1; $LW::LW_SSL_LIB=2; $LW::available{'net::ssl'}=$Net::SSL::VERSION;} } ## non-blocking IO ## if($^O!~/Win32/){ eval "use POSIX qw(:errno_h :fcntl_h)"; # better if(!$@){ $LW::LW_NONBLOCK_CONNECT=1; } } } # if($LW_HAS_SOCKET) } # BEGIN ####### package variables ####################################### ## crawl subpackage ## %LW::crawl_config=( 'save_cookies' => 0, 'reuse_cookies' => 1, 'save_offsites' => 0, 'follow_moves' => 1, 'url_limit' => 1000, 'use_params' => 0, 'params_double_record' => 0, 'skip_ext' => '.gif .jpg .gz .mp3 .swf .zip ', 'save_skipped' => 0, 'save_referrers'=> 0, 'do_head' => 0, 'callback' => 0, 'slashdot_bug' => 1, 'normalize_uri' => 1, 'source_callback' => 0 ); @LW::crawl_urls=();; %LW::crawl_server_tags=(); %LW::crawl_referrers=(); %LW::crawl_offsites=(); %LW::crawl_cookies=(); %LW::crawl_forms=(); %LW::crawl_temp=(); # this idea/structure was taken from HTML::LinkExtor.pm, # copyright 2000 Gisle Aas and Michael A. Chase %LW::crawl_linktags = ( 'a' => 'href', 'applet' => [qw(codebase archive code)], 'area' => 'href', 'base' => 'href', 'bgsound' => 'src', 'blockquote' => 'cite', 'body' => 'background', 'del' => 'cite', 'embed' => [qw(src pluginspage)], 'form' => 'action', 'frame' => [qw(src longdesc)], 'iframe' => [qw(src longdesc)], 'ilayer' => 'background', 'img' => [qw(src lowsrc longdesc usemap)], 'input' => [qw(src usemap)], 'ins' => 'cite', 'isindex' => 'action', 'head' => 'profile', 'layer' => [qw(background src)], 'link' => 'href', 'object' => [qw(codebase data archive usemap)], 'q' => 'cite', 'script' => 'src', 'table' => 'background', 'td' => 'background', 'th' => 'background', 'xmp' => 'href', ); ## forms subpackage ## @LW::forms_found=(); %LW::forms_current=(); ## http subpackage ## my $SOCKSTATE=0; my $TIMEOUT=10; # default my ($STATS_REQS,$STATS_SYNS)=(0,0); my ($LAST_HOST,$LAST_INET_ATON,$LAST_SSL)=('','',0); my ($OUTGOING_QUEUE,$INCOMING_QUEUE)=('',''); my ($SSL_CTX, $SSL_THINGY); my %http_host_cache=(); # order is following: # [0] - SOCKET # [1] - $SOCKSTATE # [2] - INET_ATON # [3] - $SSL_CTX # [4] - $SSL_THINGY # [5] - $OUTGOING_QUEUE # [6] - $INCOMING_QUEUE # [7] - $STATS_SYNS # [8] - $STATS_REQS my $Z; # array ref to current host specs =pod =head1 ++ Sub package: anti-ids The anti-ids sub package implements management routines for various rewriting/encoding in order to evade intrusion detection systems. =cut ######################################################################## =pod =head1 - Function: LW::anti_ids Params: \%hin, $modes Return: nothing LW::anti_ids computes the proper anti-ids encoding/tricks specified by $modes, and sets up %hin in order to use those tricks. Valid modes are (the mode numbers are the same as those found in whisker 1.4): 1 - Encode some of the characters via normal URL encoding 2 - Insert directory self-references (/./) 3 - Premature URL ending (make it appear the request line is done) 4 - Prepend a long random string in the form of "/string/../URL" 5 - Add a fake URL parameter 6 - Use a tab instead of a space as a request spacer 7 - Change the case of the URL around (works against Windows and Novell) 8 - Change normal seperators ('/') to Windows version ('\') 9 - Session splicing (sending data in multiple packets) You can set multiple modes by setting the string to contain all the modes desired; i.e. $modes="146" will use modes 1, 4, and 6. =cut sub anti_ids { my ($rhin,$modes)=(shift,shift); my (@T,$x,$c,$s,$y); my $ENCODED=0; my $W = $$rhin{'whisker'}; return if(!(defined $rhin && ref($rhin))); # in case they didn't do it already $$rhin{'whisker'}->{'uri_orig'}=$$rhin{'whisker'}->{'uri'}; # note: order is important! # mode 9 - session splicing if($modes=~/9/){ $$rhin{'whisker'}->{'ids_session_splice'}=1; } # mode 4 - prepend long random string if($modes=~/4/){$s=''; if($$W{'uri'}=~m#^/#){ $y=&utils_randstr; $s.=$y while(length($s)<512); $$W{'uri'}="/$s/..".$$W{'uri'}; } } # mode 7 - (windows) random case sensitivity if($modes=~/7/){ @T=split(//,$$W{'uri'}); for($x=0;$x<(scalar @T);$x++){ if((rand()*2)%2 == 1){ $T[$x]=uc($T[$x]);}} $$W{'uri'}=join('',@T); } # mode 2 - directory self-reference (/./) if($modes=~/2/){ $$W{'uri'}=~s#/#/./#g; } # mode 8 - windows directory separator (\) if($modes=~/8/){ $$W{'uri'}=~s#/#\\#g; $$W{'uri'}=~s#^\\#/#; $$W{'uri'}=~s#^(http|file|ftp|nntp|news|telnet):\\#$1://#; $$W{'uri'}=~s#\\$#/#; } # mode 1 - random URI (non-UTF8) encoding if($modes=~/1/){ if($ENCODED==0){ $$W{'uri'}=encode_str2ruri($$W{'uri'}); $ENCODED=1;} } # mode 5 - fake parameter if($modes=~/5/){ ($s,$y)=(&utils_randstr,&utils_randstr); $$W{'uri'}="/$s.html%3f$y=/../$$W{'uri'}"; } # mode 3 - premature URL ending if($modes=~/3/){ $s=&utils_randstr; $$W{'uri'}="/%20HTTP/1.1%0D%0A%0D%0AAccept%3A%20$s/../..$$W{'uri'}"; } # mode 6 - TAB as request spacer if($modes=~/6/){ $$W{'req_spacer'}="\t"; } } # end anti_ids =pod =head1 ++ Sub package: auth The auth sub package implements HTTP authentication routines. =cut ######################################################################## =pod =head1 - Function: LW::auth_brute_force Params: $auth_method, \%hin, $user, \@passwords [, $domain] Return: $first_valid_password, undef if error/none found Perform a HTTP authentication brute force against a server (host and URI defined in %hin). It will try every password in the password array for the given user. The first password (in conjunction with the given user) that doesn't return HTTP 401 is returned (and the brute force is stopped at that point). $domain is optional, and is only used for NTLM auth. =cut sub auth_brute_force { my ($auth_method, $hrin, $user, $pwordref, $dom)=@_; my ($P,%hout); return undef if(!defined $auth_method || length($auth_method)==0); return undef if(!defined $user || length($user) ==0); return undef if(!(defined $hrin && ref($hrin) )); return undef if(!(defined $pwordref && ref($pwordref))); map { ($P=$_)=~tr/\r\n//d; auth_set_header($auth_method,$hrin,$user,$P,$dom); return undef if(http_do_request($hrin,\%hout)); return $P if($hout{'whisker'}->{'http_resp'} ne 401); } @$pwordref; return undef;} ######################################################################## =pod =head1 - Function: LW::auth_set_header Params: $auth_method, \%hin, $user, $password [, $domain] Return: nothing (modifies %hin) Set the appropriate authentication header in %hin. NOTE: right now only BASIC and NTLM are supported. =cut sub auth_set_header { my ($method, $href, $user, $pass, $domain)=(lc(shift),@_); return if(!(defined $href && ref($href))); return if(!defined $user || !defined $pass); if($method eq 'basic'){ $$href{'Authorization'}='Basic '.encode_base64($user.':'.$pass,''); } if($method eq 'proxy-basic'){ $$href{'Proxy-Authorization'}='Basic '.encode_base64($user.':'.$pass,''); } if($method eq 'ntlm'){ my $o=ntlm_new($user,$pass,$domain); $$href{'whisker'}->{'ntlm_obj'}=$o; $$href{'whisker'}->{'ntlm_step'}=0; $$href{'Authorization'}='NTLM '.ntlm_client($o); } } ######################################################################## =pod =head1 - Function: LW::do_auth Params: $auth_method, \%hin, $user, $password [, $domain] Return: nothing (modifies %hin) This is an alias for auth_set_header(). =cut sub do_auth { goto &auth_set_header; } =pod =head1 ++ Sub package: bruteurl The bruteurl sub package is used to perform a brute-force of HTTP requests on an array of string components. =cut =pod =head1 - Function: LW::bruteurl Params: \%hin, $pre, $post, \@values_in, \@values_out Return: Nothing (adds to @out) Bruteurl will perform a brute force against the host/server specified in %hin. However, it will make one request per entry in @in, taking the value and setting $hin{'whisker'}->{'uri'}= $pre.value.$post. Any URI responding with an HTTP 200 or 403 response is pushed into @out. An example of this would be to brute force usernames, putting a list of common usernames in @in, setting $pre='/~' and $post='/'. =cut sub bruteurl { my ($hin, $upre, $upost, $arin, $arout)=@_; my ($U,%hout); return if(!(defined $hin && ref($hin) )); return if(!(defined $arin && ref($arin) )); return if(!(defined $arout && ref($arout))); return if(!defined $upre || length($upre) ==0); return if(!defined $upost || length($upost)==0); http_fixup_request($hin); map { ($U=$_)=~tr/\r\n//d; next if($U eq ''); if(!http_do_request($hin,\%hout,{'uri'=>$upre.$U.$upost})){ if( $hout{'whisker'}->{'http_resp'}==200 || $hout{'whisker'}->{'http_resp'}==403){ push(@{$arout},$U); } } } @$arin; } =pod =head1 ++ Sub package: cookie Cookie handling functions. Cookies are stored in a "jar" (hash), indexed by cookie name. The contents are an anonymous array: $jar{'name'}=@( 'value', 'domain', 'path', 'expire', 'secure' ) =cut ######################################################################## =pod =head1 - Function: LW::cookie_read Params: \%jar, \%hout Return: $num_of_cookies_read Read in cookies from an %hout hash (HTTP response), and put them in %jar. =cut sub cookie_read { my ($count,$jarref,$href)=(0,@_); return 0 if(!(defined $jarref && ref($jarref))); return 0 if(!(defined $href && ref($href) )); my $target = utils_find_lowercase_key($href,'set-cookie'); if(!defined $target){ return 0;} if(ref($target)){ # multiple headers foreach (@{$target}){ cookie_parse($jarref,$_); $count++; } } else { # single header cookie_parse($jarref,$target); $count=1; } return $count; } ######################################################################## =pod =head1 - Function: LW::cookie_parse Params: \%jar, $cookie Return: nothing Parses the cookie into the various parts and then sets the appropriate values in the %jar under the name; if the cookie is blank, it will delete it from the jar. =cut sub cookie_parse { my ($jarref, $header)=@_; my ($del,$part,@parts,@construct,$cookie_name)=(0); return if(!(defined $jarref && ref($jarref))); return if(!(defined $header && length($header)>0)); @parts=split(/;/,$header); foreach $part (@parts){ if($part=~/^[ \t]*(.+?)=(.*)$/){ my ($name,$val)=($1,$2); if($name=~/^domain$/i){ $val=~s#^http://##; $val=~s#/.*$##; $construct[1]=$val; } elsif($name=~/^path$/i){ $val=~s#/$## if($val ne '/'); $construct[2]=$val; } elsif($name=~/^expires$/i){ $construct[3]=$val; } else { $cookie_name=$name; if($val eq ''){ $del=1; } else { $construct[0]=$val;} } } else { if($part=~/secure/){ $construct[4]=1;} } } if($del){ delete $$jarref{$cookie_name} if defined $$jarref{$cookie_name}; } else { $$jarref{$cookie_name}=\@construct; } } ######################################################################## =pod =head1 - Function: LW::cookie_write Params: \%jar, \%hin, $override Return: nothing Goes through the given jar and sets the Cookie header in %hin pending the correct domain and path. If $override is true, then the domain and path restrictions of the cookies are ignored. Todo: factor in expire and secure. =cut sub cookie_write { my ($jarref, $hin, $override)=@_; my ($name,$out)=('',''); return if(!(defined $jarref && ref($jarref))); return if(!(defined $hin && ref($hin) )); $override=$override||0; $$hin{'whisker'}->{'ssl'}=$$hin{'whisker'}->{'ssl'}||0; foreach $name (keys %$jarref){ next if($name eq ''); next if($$hin{'whisker'}->{'ssl'}==0 && $$jarref{$name}->[4]>0); if($override || ($$hin{'whisker'}->{'host'}=~/$$jarref{$name}->[1]$/i && $$hin{'whisker'}->{'uri'}=~/$$jarref{$name}->[2]/i)){ $out.="$name=$$jarref{$name}->[0];"; } } if($out ne ''){ $$hin{'Cookie'}=$out; } } ######################################################################## =pod =head1 - Function: LW::cookie_get Params: \%jar, $name Return: @elements Fetch the named cookie from the jar, and return the components. =cut sub cookie_get { my ($jarref,$name)=@_; return undef if(!(defined $jarref && ref($jarref))); if(defined $$jarref{$name}){ return @{$$jarref{$name}};} return undef; } ######################################################################## =pod =head1 - Function: LW::cookie_set Params: \%jar, $name, $value, $domain, $path, $expire, $secure Return: nothing Set the named cookie with the provided values into the %jar. =cut sub cookie_set { my ($jarref,$name,$value,$domain,$path,$expire,$secure)=@_; my @construct; return if(!(defined $jarref && ref($jarref))); return if($name eq ''); if($value eq ''){ delete $$jarref{$name}; return;} $path=$path||'/'; $secure=$secure||0; @construct=($value,$domain,$path,$expire,$secure); $$jarref{$name}=\@construct; } ######################################################################## =pod =head1 ++ Sub package: crawl Used for crawling a website by requesting a (start) page, reading the HTML, extracting the links, and then requesting those links--up to a specified depth. The module also allows various configuration tweaks to do such things as monitor requests for offsite URLs (pages on other hosts), track various cookies, etc. =cut ##################################################### =pod =head1 - Function: LW::crawl Params: $START, $MAX_DEPTH, \%tracking, \%hin Return: Nothing The heart of the crawl package. Will perform an HTTP crawl on the specified HOST, starting at START URI, proceeding up to MAX_DEPTH. A tracking hash reference (required) stores the results of each page (and ongoing progress). The http_in_options hash reference specifies a standard HTTP hash for use in the outgoing HTTP requests. Certain options are configurable via LW::crawl_set_config(). The tracking hash will contain all the pages visited; you can get the crawl engine to skip pages by placing them in the tracking hash ahead of time. START (first) parameter should be of the form "http://www.host.com/url". =cut sub crawl { my ($START, $MAX_DEPTH, $hrtrack, $hrin)=@_; my (%hout, %jar); my ($T, @ST, @links, @tlinks, @vals, @ERRORS)=(''); return if(!(defined $hrtrack && ref($hrtrack))); return if(!(defined $hrin && ref($hrin) )); return if(!defined $START || length($START)==0); $MAX_DEPTH||=2; # $ST[0]=HOST $ST[1]=URL $ST[2]=CWD $ST[3]=HTTPS $ST[4]=SERVER # $ST[5]=PORT $ST[6]=DEPTH @vals=utils_split_uri($START); $ST[1]=$vals[0]; # uri $ST[0]=$vals[2]; # host $ST[5]=$vals[3]; # port $ST[4]=undef; # server tag return if($ST[0] eq ''); # some various informationz... $LW::crawl_config{'host'}=$ST[0]; $LW::crawl_config{'port'}=$ST[5]; $LW::crawl_config{'start'}=$ST[1]; $$hrin{'whisker'}->{'host'}=$ST[0]; $$hrin{'whisker'}->{'port'}=$ST[5]; $$hrin{'whisker'}->{'lowercase_incoming_headers'}=1; # makes life easier http_fixup_request($hrin); # this is so callbacks can access internals via references $LW::crawl_config{'ref_links'}=\@links; $LW::crawl_config{'ref_jar'}=\%jar; $LW::crawl_config{'ref_hin'}=$hrin; $LW::crawl_config{'ref_hout'}=\%hout; %LW::crawl_referrers=(); # empty out existing referrers %LW::crawl_server_tags=(); %LW::crawl_offsites=(); %LW::crawl_cookies=(); %LW::crawl_forms=(); push @links, \@{[$ST[1],1,($vals[1] eq 'https')?1:0]}; while(@links){ my $C=shift @links; $ST[1]=$C->[0]; # url $ST[6]=$C->[1]; # depth $ST[3]=$C->[2]; # https next if(defined $$hrtrack{$ST[1]} && $$hrtrack{$ST[1]} ne '?'); if($ST[6] > $MAX_DEPTH){ $$hrtrack{$ST[1]}='?' if($LW::crawl_config{'save_skipped'}>0); next; } $ST[2]=utils_get_dir($ST[1]); $$hrin{'whisker'}->{'uri'}=$ST[1]; $$hrin{'whisker'}->{'ssl'}=$ST[3]; my $result = crawl_do_request($hrin,\%hout); if($result==1 || $result==2){ push @ERRORS, "Error on making request for '$ST[1]': $hout{'whisker'}->{'error'}"; next; } if($result==0 || $result==4){ $$hrtrack{$ST[1]}=$hout{'whisker'}->{'http_resp'}; } if($result==3 || $result==5){ $$hrtrack{$ST[1]}='?' if($LW::crawl_config{'save_skipped'}>0); } if(defined $hout{'server'}){ if(!defined $ST[4]){ # server tag $ST[4]=$hout{'server'}; } $LW::crawl_server_tags{$hout{'server'}}++; } if(defined $hout{'set-cookie'}){ if($LW::crawl_config{'save_cookies'}>0){ if(ref($hout{'set-cookie'})){ foreach (@{$hout{'set-cookie'}}){ $LW::crawl_cookies{$_}++; } } else { $LW::crawl_cookies{$hout{'set-cookie'}}++; } } if($LW::crawl_config{'reuse_cookies'}>0){ cookie_read(\%jar,\%hout); } } next if($result==4 || $result==5); next if(scalar @links > $LW::crawl_config{'url_limit'}); if($result==0){ # page should be parsed if($LW::crawl_config{'source_callback'} != 0 && ref($LW::crawl_config{'source_callback'})){ &{$LW::crawl_config{'source_callback'}}($hrin,\%hout); } LW::html_find_tags(\$hout{'whisker'}->{'data'}, \&crawl_extract_links_test); $LW::crawl_config{'stats_html'}++; # count how many pages we've parsed } if($result==3){ # follow the move via location header push @LW::crawl_urls, $hout{'location'}; } foreach $T (@LW::crawl_urls){ $T=~tr/\0\r\n//d; # the NULL character is a bug that's somewhere next if (length($T)==0); next if ($T=~/^javascript:/i); # stupid javascript next if ($T=~/^mailto:/i); next if ($T=~m#^([a-zA-Z]*)://# && lc($1) ne 'http' && lc($1) ne 'https'); next if ($T=~/^#/i); # fragment if($LW::crawl_config{'callback'} != 0){ next if &{$LW::crawl_config{'callback'}}($T,@ST); } push(@{$LW::crawl_referrers{$T}}, $ST[1]) if( $LW::crawl_config{'save_referrers'}>0 ); $T=utils_absolute_uri($T,$ST[1],1) if($LW::crawl_config{'normalize_uri'}>0); @vals=utils_split_uri($T); # slashdot bug: workaround for the following fsck'd html code: #
if($LW::crawl_config{'slashdot_bug'} > 0 && substr($vals[0],0,2) eq '//'){ if($ST[3]==1){ $T='https:'.$T; } else { $T='http:' .$T; } @vals=utils_split_uri($T); } # make sure URL is on same host, port, and protocol if( (defined $vals[2] && $vals[2] ne $ST[0]) || (defined $vals[3] && $vals[3] != $ST[5]) || (defined $vals[1] && ($vals[1] ne 'http' && $vals[1] ne 'https'))){ if($LW::crawl_config{'save_offsites'}>0){ $LW::crawl_offsites{utils_join_uri(@vals)}++; } next; } if(substr($vals[0],0,1) ne '/'){ $vals[0]=$ST[2].$vals[0]; } my $where=rindex($vals[0],'.'); my $EXT=''; if($where >= 0){ $EXT = substr($vals[0], $where+1, length($vals[0])-$where); } $EXT=~tr/0-9a-zA-Z//cd; # yucky chars will puke regex below if($EXT ne '' && $LW::crawl_config{'skip_ext'}=~/\.$EXT /i){ if($LW::crawl_config{'save_skipped'}>0){ $$hrtrack{$vals[0]}='?'; } next; } if(defined $vals[4] && $LW::crawl_config{'use_params'}>0){ if($LW::crawl_config{'params_double_record'}>0 && !defined $$hrtrack{$vals[0]}){ $$hrtrack{$vals[0]}='?'; } $vals[0]=$vals[0].'?'.$vals[4]; } next if(defined $$hrtrack{$vals[0]}); push @links, \@{[$vals[0],$ST[6]+1, ($vals[1] eq 'https')?1:0]}; } # foreach @LW::crawl_urls=(); # reset for next round } # while my $key; foreach $key (keys %LW::crawl_config){ delete $LW::crawl_config{$key} if (substr($key,0,4) eq 'ref_');} $LW::crawl_config{'stats_reqs'}=$hout{'whisker'}->{'stats_reqs'}; $LW::crawl_config{'stats_syns'}=$hout{'whisker'}->{'stats_syns'}; } # end sub crawl ##################################################### =pod =head1 - Function: LW::crawl_get_config Params: $config_directive Return: $config_directive_value Returns the set value of the submitted config_directive. See LW::crawl_set_config() for a list of configuration values. =cut sub crawl_get_config { my $key=shift; return $LW::crawl_config{$key}; } ##################################################### =pod =head1 - Function: LW::crawl_set_config Params: $config_directive, $value Return: Nothing This function adjusts the configuration of the crawl package. Use values of 0 and 1 for off and on, respectively. The defaults are set in libs/globals.wpl. save_cookies - crawl will save all cookies encountered, for later review save_offsite_urls - crawl will save all offsite URLs (URLs not on this host); crawl will not actually crawl those hosts (use separate calls to crawl) follow_moves - crawl will follow the URL received from an HTTP move response use_params - crawl will factor in URI parameters when considering if a URI is unique or not params_double_record - if both use_params and params_double_record are set, crawl will make two entries for each URI which has paramaters: one with and one without the parameters reuse_cookies - crawl will resubmit any received/prior cookies skip_ext - crawl will ignore requests for URLs ending in extensions given; the value requires a specific string format: (dot)extension(space). For example, to ignore GIFs and JPGs, you would run: LW::crawl_set_config('skip_ext',".gif .jpg "); save_skipped - any URLs that are skipped via skip_ext, or are above the specified DEPTH will be recorded in the tracking hash with a value of '?' (instead of an HTTP response code). callback - crawl will call this function (if this is a reference to a function), passing it the current URI and the @ST array (which has host, port, SSL, etc info). If the function returns a TRUE value, then crawl will skip that URI. Set to value 0 (zero) if you do not want to use a callback. slashdot_bug - slashdot.org uses a screwy piece of invalid (yet it works) HTML in the form of . So basically, when a URL starts with '//' and slashdot_bug is set to 1 (which it is by default), then the proper 'http:' or 'https:' will be prepended to the URL. source_callback - crawl will call this function (if this is a reference to a function), passing references to %hin and %hout, right before it parses the page for HTML links. This allows the callback function to review or modify the HTML before it's parsed for links. Return value is ignored. url_limit - number or URLs that crawl will queue up at one time; defaults to 1000 do_head - use head requests to determine if a file has a content-type worth downloading. Potentially saves some time, assuming the server properly supports HEAD requests. Set to value 1 to use (0/off by default). =cut sub crawl_set_config { return if(!defined $_[0]); my %opts=@_; while( my($k,$v)=each %opts){ $LW::crawl_config{lc($k)}=$v; } } ##################################################### =pod =head1 - Function: LW::crawl_extract_links_test (INTERNAL) Params: $TAG, \%elements, \$html_data, $offset, $len Return: nothing This is the callback function used by the crawl function, and passed to html_find_tags. It will find URL/URI links and place them in @LW::crawl_urls. =cut sub crawl_extract_links_test { my ($TAG, $hr, $dr, $start, $len)=(lc(shift),@_); my $t; # this should be most of the time... return undef if(!defined ($t=$LW::crawl_linktags{$TAG})); return undef if(!scalar %$hr); # fastpath quickie while( my ($key,$val)= each %$hr){ # normalize element values $$hr{lc($key)} = $val; } if(ref($t)){ foreach (@$t){ push(@LW::crawl_urls,$$hr{$_}) if(defined $$hr{$_}); } } else { push(@LW::crawl_urls,$$hr{$t}) if(defined $$hr{$t}); } if($TAG eq 'form' && defined $$hr{action}){ my $u=$LW::crawl_config{'ref_hout'}->{'whisker'}->{'uri'}; $LW::crawl_forms{utils_absolute_uri($$hr{action},$u,1)}++; } return undef; } ################################################################ =pod =head1 - Function: LW::crawl_make_request (INTERNAL) Params: \%hin, \%hout Return: $status_code This is an internal function used by LW::crawl(), and is responsible for making HTTP requests, including any HEAD pre-requests and following move responses. Status codes are: 0 Success 1 Error during request 2 Error on connection setup 3 Move request; follow Location header 4 File not of text/htm(l) type 5 File not available =cut sub crawl_do_request { my ($hrin,$hrout) = @_; my $ret; if($LW::crawl_config{'do_head'}){ my $save=$$hrin{'whisker'}->{'method'}; $$hrin{'whisker'}->{'method'}='HEAD'; $ret=http_do_request($hrin,$hrout); $$hrin{'whisker'}->{'method'}=$save; return 2 if($ret==2); # if there was connection error, do not continue if($ret==0){ # successful request if($$hrout{'whisker'}->{'http_resp'}==501){ # HEAD not allowed $LW::crawl_config{'do_head'}=0; # no more HEAD requests } if($$hrout{'whisker'}->{'http_resp'} <308 && $$hrout{'whisker'}->{'http_resp'} >300){ if($LW::crawl_config{'follow_moves'} >0){ return 3 if(defined $$hrout{'location'}); } return 5; # not avail } if($$hrout{'whisker'}->{'http_resp'}==200){ # no content-type is treated as text/htm if(defined $$hrout{'content-type'} && $$hrout{'content-type'}!~/^text\/htm/i){ return 4; } # fall through to GET request below } } # request errors are essentially redone via GET, below } return http_do_request($hrin,$hrout); } ##################################################### =pod =head1 ++ Sub package: dump The dump subpackage contains various utility functions which emulate the basic functionality provided by Data::Dumper. =cut ######################################################################## =pod =head1 - Function: LW::dumper Params: $name, \@array [, $name, \%hash, $name, \$scalar ] Return: $code, undef on error The dumper function will take the given $name and data reference, and will create an ASCII perl code representation suitable for eval'ing later to recreate the same structure. $name is the name of the variable that it will be saved as. Example: $output = LW::dumper('hin',\%hin); NOTE: dumper() creates anonymous structures under the name given. For example, if you dump the hash %hin under the name 'hin', then when you eval the dumped code you will need to use %$hin, since $hin is now a *reference* to a hash. =cut sub dumper { my %what=@_; my ($final,$k,$v)=(''); while( ($k,$v)=each %what){ return undef if(ref($k) || !ref($v)); $final.="\$$k = "._dump(1,$v,1); $final=~s#,\n$##; $final.=";\n"; } return $final; } ######################################################################## =pod =head1 - Function: LW::dumper_writefile Params: $file, $name, \@array [, $name, \%hash, $name, \@scalar ] Return: 0 if success; 1 if error This calls dumper() and saves the output to the specified $file. Note: LW does not checking on the validity of the file name, it's creation, or anything of the sort. Files are opened in overwrite mode. =cut sub dumper_writefile { my $file=shift; my $output=dumper(@_); return 1 if(!open(OUT,">$file") || $output eq 'ERROR'); print OUT $output; close(OUT); } ######################################################################## =pod =head1 - Function: LW::_dump (INTERNAL) Params: $tabs, $ref Return: $output This is an internal function to dumper() which will dereference all elements and produce the resulting code. This function is not intended for external use. =cut sub _dump { # dereference and dump an element my ($t, $ref, $depth)=@_; my ($out,$k,$v)=(''); $depth||=1; # to protect against circular loops return 'undef' if($depth > 128); if(!defined $ref){ return 'undef'; } elsif(ref($ref) eq 'HASH'){ $out.="{\n"; while( ($k,$v)=each %$ref){ next if($k eq ''); $out.= "\t"x$t; $out.=_dumpd($k).' => '; if(ref($v)){ $out.=_dump($t+1,$v,$depth+1); } else { $out.=_dumpd($v); } $out.=",\n" unless( substr($out,-2,2) eq ",\n"); } $out=~s#,\n$#\n#; $out.="\t"x($t-1); $out.="},\n"; } elsif(ref($ref) eq 'ARRAY'){ $out.="["; if(~~@$ref){ $out.="\n"; foreach $v (@$ref) { $out.= "\t"x$t; if(ref($v)){ $out.=_dump($t+1,$v,$depth+1); } else { $out.=_dumpd($v); } $out.=",\n" unless( substr($out,-2,2) eq ",\n"); } $out=~s#,\n$#\n#; $out.="\t"x($t-1); } $out.="],\n"; } elsif(ref($ref) eq 'SCALAR'){ $out.=_dumpd($$ref); } elsif(ref($ref) eq 'REF'){ $out.=_dump($t,$$ref,$depth+1); } elsif(ref($ref)){ # unknown/unsupported ref $out.="undef"; } else { # normal scalar $out.=_dumpd($ref); } return $out; } ######################################################################## =pod =head1 - Function: LW::_dumpd (INTERNAL) Params: $string Return: $escaped_string This is an internal function to dumper() which will escape the given string to make it suitable for printing. This function is not intended for external use. =cut sub _dumpd { # escape a scalar string my $v=shift; return 'undef' if(!defined $v); return "''" if($v eq ''); return "$v" if($v!~tr/0-9//c); return "'$v'" if($v!~tr/ !-~//c); $v=~s#\\#\\\\#g; $v=~s#"#\\"#g; $v=~s#\r#\\r#g; $v=~s#\n#\\n#g; $v=~s#\0#\\0#g; $v=~s#\t#\\t#g; $v=~s#([^!-~ ])#sprintf('\\x%02x',ord($1))#eg; return "\"$v\""; } ######################################################################## =pod =head1 ++ Sub package: easy The 'easy' subpackage contains many high-level/simple functions to do basic web tasks. This should make it easier to use libwhisker to do basic tasks. =cut ######################################################################## =pod =head1 - Function: LW::get_page Params: $url [, \%hin_request] Return: $code, $data ($code will be set to undef on error, $data will contain error message) This function will fetch the page at the given URL, and return the HTTP response code and page contents. Use this in the form of: ($code,$html)=LW::get_page("http://host.com/page.html") The optional %hin_request will be used if supplied. This allows you to set headers and other parameters. =cut sub get_page { my ($URL,$hr)=(shift,shift); return (undef,"No URL supplied") if(length($URL)==0); my (%req,%resp); my $rptr; if(defined $hr && ref($hr)){ $rptr=$hr; } else { $rptr=\%req; LW::http_init_request(\%req); } LW::utils_split_uri($URL,$rptr); # this is newer >=1.1 syntax LW::http_fixup_request($rptr); if(http_do_request($rptr,\%resp)){ return (undef,$resp{'whisker'}->{'error'}); } return ($resp{'whisker'}->{'code'}, $resp{'whisker'}->{'data'}); } ######################################################################## =pod =head1 - Function: LW::get_page_hash Params: $url [, \%hin_request] Return: $hash_ref (undef on no URL) This function will fetch the page at the given URL, and return the whisker HTTP response hash. The return code of the function is set to $hash_ref->{whisker}->{get_page_hash}, and uses the LW::http_do_request() response values. Note: undef is returned if no URL is supplied =cut sub get_page_hash { my ($URL,$hr)=(shift,shift); return undef if(length($URL)==0); my (%req,%resp); my $rptr; if(defined $hr && ref($hr)){ $rptr=$hr; } else { $rptr=\%req; LW::http_init_request(\%req); } LW::utils_split_uri($URL,$rptr); # this is newer >=1.1 syntax LW::http_fixup_request($rptr); my $r=http_do_request($rptr,\%resp); $resp{whisker}->{get_page_hash}=$r; return \%resp; } ######################################################################## =pod =head1 - Function: LW::get_page_to_file Params: $url, $filepath [, \%hin_request] Return: $code ($code will be set to undef on error) This function will fetch the page at the given URL, place the resulting HTML in the file specified, and return the HTTP response code. The optional %hin_request hash sets the default parameters to be used in the request. NOTE: libwhisker does not do any file checking; libwhisker will open the supplied filepath for writing, overwriting any previously-existing files. Libwhisker does not differentiate between a bad request, and a bad file open. If you're having troubles making this function work, make sure that your $filepath is legal and valid, and that you have appropriate write permissions to create/overwrite that file. =cut sub get_page_to_file { my ($URL, $filepath, $hr)=@_; return undef if(length($URL)==0); return undef if(length($filepath)==0); my (%req,%resp); my $rptr; if(defined $hr && ref($hr)){ $rptr=$hr; } else { $rptr=\%req; LW::http_init_request(\%req); } LW::utils_split_uri($URL,$rptr); # this is newer >=1.1 syntax LW::http_fixup_request($rptr); if(http_do_request($rptr,\%resp)){ return undef; } open(OUT,">$filepath") || return undef; binmode(OUT); # stupid Windows print OUT $resp{'whisker'}->{'data'}; close(OUT); return $resp{'whisker'}->{'code'}; } ######################################################################## =pod =head1 - Function: LW::upload_file Params: $url, $filepath, $paramname [, \%hin_request] Return: $code ($code will be set to undef on error) This function will upload the specified $file to the given $url as the parameter named $paramname via a multipart POST request. The optional $hin_request hash lets you set any other particular request parameters. NOTE: this is a highly simplied function for basic uploads. If you need to do more advanced things like set other multipart form parameters, send multiple files, etc, then you will need to use the normal API to do it yourself. =cut sub upload_file { my ($URL, $filepath, $paramname, $hr)=@_; return undef if(length($URL) ==0); return undef if(length($filepath) ==0); return undef if(length($paramname)==0); return undef if(!(-e $filepath && -f $filepath)); my (%req,%resp,%multi); my $rptr; if(defined $hr && ref($hr)){ $rptr=$hr; } else { $rptr=\%req; LW::http_init_request(\%req); } LW::utils_split_uri($URL,$rptr); # this is newer >=1.1 syntax $rptr{'whisker'}->{'method'}='POST'; LW::http_fixup_request($rptr); LW::multipart_setfile(\%multi,$filepath,$paramname); LW::multipart_write(\%multi,$rptr); if(http_do_request($rptr,\%resp)){ return undef; } return $resp{'whisker'}->{'code'}; } ######################################################################## =pod =head1 - Function: LW::download_file Params: $url, $filepath [, \%hin_request] Return: $code ($code will be set to undef on error) LW::download_file is just an alias for LW::get_page_to_file(). =cut sub download_file { goto &LW::get_page_to_file; } ######################################################################## =pod =head1 ++ Sub package: encode Various type encodings. Installing MIME::Base64 will result in a compiled C version of base64 functions, which means they will be tons faster. This is useful if you're going to run a Basic authentication brute force, which requires a high processing speed. However, it's not required, since I include a Perl version, which is slower. =cut ######################################################################## =pod =head1 - Function: LW::encode_base64 Params: $data, $eol Return: $base64_encoded_data LW::encode_base64 is a stub function which will choose the fastest function available for doing base64 encoding. This is done by checking to see if the MIME::Base64 perl module is available (which uses fast C routines). If it's not, then it defaults to a perl version (which is slower). You can call the perl version direct, but I suggest using the stub to gain speed advantages where possible. =cut #sub encode_base64; ######################################################################## =pod =head1 - Function: LW::decode_base64 Params: $data Return: $base64_decoded_data LW::decode_base64 is a stub function which will choose the fastest function available for doing base64 decoding. This is done by checking to see if the MIME::Base64 perl module is available (which uses fast C routines). If it's not, then it defaults to a perl version (which is slower). You can call the perl version direct, but I suggest using the stub to gain speed advantages where possible. =cut #sub decode_base64; ######################################################################## =pod =head1 - Function: LW::encode_base64_perl Params: $data, $eol Return: $b64_encoded_data A perl implementation of base64 encoding. I recommend you use LW::encode_base64 instead, since it may use the MIME::Base64 module (if available), which lead to speed advantages. The perl code for this function was actually taken from an older MIME::Base64 perl module, and bears the following copyright: Copyright 1995-1999 Gisle Aas NOTE: the $eol parameter will be inserted every 76 characters. This is used to format the data for output on a 80 character wide terminal. =cut sub encode_base64_perl { # ripped from MIME::Base64 my $res = ""; my $eol = $_[1]; $eol = "\n" unless defined $eol; pos($_[0]) = 0; while ($_[0] =~ /(.{1,45})/gs) { $res .= substr(pack('u', $1), 1); chop($res);} $res =~ tr|` -_|AA-Za-z0-9+/|; my $padding = (3 - length($_[0]) % 3) % 3; $res =~ s/.{$padding}$/'=' x $padding/e if $padding; if (length $eol) { $res =~ s/(.{1,76})/$1$eol/g; } $res; } ######################################################################## =pod =head1 - Function: LW::decode_base64_perl Params: $data Return: $b64_decoded_data A perl implementation of base64 decoding. The perl code for this function was actually taken from an older MIME::Base64 perl module, and bears the following copyright: Copyright 1995-1999 Gisle Aas =cut sub decode_base64_perl { # ripped from MIME::Base64 my $str = shift; my $res = ""; $str =~ tr|A-Za-z0-9+=/||cd; $str =~ s/=+$//; # remove padding $str =~ tr|A-Za-z0-9+/| -_|; # convert to uuencoded format while ($str =~ /(.{1,60})/gs) { my $len = chr(32 + length($1)*3/4); # compute length byte $res .= unpack("u", $len . $1 ); # uudecode }$res;} ######################################################################## =pod =head1 - Function: LW::encode_str2uri Params: $data Return: $result This function encodes every character (except the / character) with normal URL hex encoding. =cut sub encode_str2uri { # normal hex encoding my $str=shift; $str=~s/([^\/])/sprintf("%%%02x",ord($1))/ge; return $str;} ######################################################################### =pod =head1 - Function: LW::encode_str2ruri Params: $data Return: $result This function randomly encodes characters (except the / character) with normal URL hex encoding. =cut sub encode_str2ruri { # random normal hex encoding my @T=split(//,shift); my $s; foreach (@T) { if(m#;=:&@\?#){ $s.=$_; next; } if((rand()*2)%2 == 1){ $s.=sprintf("%%%02x",ord($_)) ; }else{ $s.=$_; } } return $s; } ######################################################################### =pod =head1 - Function: LW::encode_unicode Params: $data Return: $result This function converts a normal string into Windows unicode format. =cut sub encode_unicode { my $r=''; foreach $c (split(//,shift)){ $r.=pack("v",ord($c)); } return $r; } ######################################################################### =pod =head1 ++ Sub package: forms This subpackage contains various routines to parse and handle HTML forms. The goal is to parse the variable, human-readable HTML into concrete structures useable by your program. The forms package does do a good job at making these structures, but I will admit: they are not exactly simple, and thus not a cinch to work with. But then again, representing something as complex as a HTML form is not a simple thing either. I think the results are acceptable for what's trying to be done. Anyways... Forms are stored in perl hashes, with elements in the following format: $form{'element_name'}=@([ 'type', 'value', @params ]) Thus every element in the hash is an array of anonymous arrays. The first array value contains the element type (which is 'select', 'textarea', 'button', or an 'input' value of the form 'input-text', 'input-hidden', 'input-radio', etc). The second value is the value, if applicable (it could be undef if no value was specified). Note that select elements will always have an undef value--the actual values are in the subsequent options elements. The third value, if defined, is an anonymous array of additional tag parameters found in the element (like 'onchange="blah"', 'size="20"', 'maxlength="40"', 'selected', etc). The array does contain one special element, which is stored in the hash under a NULL character ("\0") key. This element is of the format: $form{"\0"}=['name', 'method', 'action', @parameters]; The element is an anonymous array that contains strings of the form's name, method, and action (values can be undef), and a @parameters array similar to that found in normal elements (above). Accessing individual values stored in the form hash becomes a test of your perl referencing skills. Hint: to access the 'value' of the third element named 'choices', you would need to do: $form{'choices'}->[2]->[1]; The '[2]' is the third element (normal array starts with 0), and the actual value is '[1]' (the type is '[0]', and the parameter array is '[2]'). =cut ################################################################ =pod =head1 - Function: LW::forms_read Params: \$html_data Return: @found_forms This function parses the given $html_data into libwhisker form hashes. It returns an array of hash references to the found forms. =cut sub forms_read { my $dr=shift; return undef if(!ref($dr) || length($$dr)==0); @LW::forms_found=(); LW::html_find_tags($dr,\&forms_parse_callback); if(scalar %LW::forms_current){ my %DUP=%LW::forms_current; push(@LW::forms_found,\%DUP); } return @LW::forms_found; } ################################################################ =pod =head1 - Function: LW::forms_write Params: \%form_hash Return: $html_of_form [undef on error] This function will take the given %form hash and compose a generic HTML representation of it, formatted with tabs and newlines in order to make it neat and tidy for printing. Note: this function does *not* escape any special characters that were embedded in the element values. =cut sub forms_write { my $hr=shift; return undef if(!ref($hr) || !(scalar %$hr)); return undef if(!defined $$hr{"\0"}); my $t='[0].'" method="'; $t.=$$hr{"\0"}->[1].'" action="'.$$hr{"\0"}->[2].'"'; if(defined $$hr{"\0"}->[3]){ $t.=' '.join(' ',@{$$hr{"\0"}->[3]}); } $t.=">\n"; while( my($name,$ar)=each(%$hr) ){ next if($name eq "\0"); foreach $a (@$ar){ my $P=''; $P=' '.join(' ', @{$$a[2]}) if(defined $$a[2]); $t.="\t"; if($$a[0] eq 'textarea'){ $t.="\n"; } elsif($$a[0]=~m/^input-(.+)$/){ $t.="\n"; } elsif($$a[0] eq 'option'){ $t.="\t