# libwhisker v1.7 # libwhisker is a collection of routines used by whisker # # libwhisker copyright 2000,2001,2002 rfp.labs # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # # More information can be found at http://www.wiretrip.net/rfp/ # Libwhisker mailing list and resources are also available at # http://sourceforge.net/projects/whisker/ # package LW; use 5.004; $LW::VERSION="1.7"; ####### external module tests ################################### BEGIN { ## LW module manager stuff ## %LW::available = (); $LW::LW_HAS_SOCKET = 0; $LW::LW_HAS_SSL = 0; $LW::LW_SSL_LIB = 0; $LW::LW_NONBLOCK_CONNECT= 0; ## binary helper - may contain functions substituted further down ## eval "use LW::bin"; # do we have libwhisker binary helpers? if($@){ $LW::available{'LW::bin'}=$LW::bin::VERSION; } ## encode subpackage ## eval "require MIME::Base64"; if($@){ *encode_base64 = \&encode_base64_perl; *decode_base64 = \&decode_base64_perl; } else{ # MIME::Base64 typically has faster C versions $LW::available{'mime::base64'}=$MIME::Base64::VERSION; *encode_base64 = \&MIME::Base64::encode_base64; *decode_base64 = \&MIME::Base64::decode_base64;} ## md5 subpackage ## eval "require MD5"; if(!$@){ $LW::available{'md5'}=$MD5::VERSION;} ## http subpackage ## eval "use Socket"; # do we have socket support? if($@){ $LW::LW_HAS_SOCKET=0; } else { $LW::LW_HAS_SOCKET=1; $LW::available{'socket'}=$Socket::VERSION;} if($LW_HAS_SOCKET){ eval "use Net::SSLeay"; # do we have SSL support? if($@){ $LW::LW_HAS_SSL=0; } else { $LW::LW_HAS_SSL=1; $LW::LW_SSL_LIB=1; $LW::available{'net::ssleay'}=$Net::SSLeay::VERSION; Net::SSLeay::load_error_strings(); Net::SSLeay::SSLeay_add_ssl_algorithms(); Net::SSLeay::randomize();} if(!$LW::LW_HAS_SSL){ eval "use Net::SSL"; # different SSL lib if($@){ $LW::LW_HAS_SSL=0; } else { $LW::LW_HAS_SSL=1; $LW::LW_SSL_LIB=2; $LW::available{'net::ssl'}=$Net::SSL::VERSION;} } ## non-blocking IO ## if($^O!~/Win32/){ eval "use POSIX qw(:errno_h :fcntl_h)"; # better if(!$@){ $LW::LW_NONBLOCK_CONNECT=1; } } } # if($LW_HAS_SOCKET) } # BEGIN ####### package variables ####################################### ## crawl subpackage ## %LW::crawl_config=( 'save_cookies' => 0, 'reuse_cookies' => 1, 'save_offsites' => 0, 'follow_moves' => 1, 'url_limit' => 1000, 'use_params' => 0, 'params_double_record' => 0, 'skip_ext' => '.gif .jpg .gz .mp3 .swf .zip ', 'save_skipped' => 0, 'save_referrers'=> 0, 'do_head' => 0, 'callback' => 0, 'slashdot_bug' => 1, 'normalize_uri' => 1, 'source_callback' => 0 ); @LW::crawl_urls=();; %LW::crawl_server_tags=(); %LW::crawl_referrers=(); %LW::crawl_offsites=(); %LW::crawl_cookies=(); %LW::crawl_forms=(); %LW::crawl_temp=(); # this idea/structure was taken from HTML::LinkExtor.pm, # copyright 2000 Gisle Aas and Michael A. Chase %LW::crawl_linktags = ( 'a' => 'href', 'applet' => [qw(codebase archive code)], 'area' => 'href', 'base' => 'href', 'bgsound' => 'src', 'blockquote' => 'cite', 'body' => 'background', 'del' => 'cite', 'embed' => [qw(src pluginspage)], 'form' => 'action', 'frame' => [qw(src longdesc)], 'iframe' => [qw(src longdesc)], 'ilayer' => 'background', 'img' => [qw(src lowsrc longdesc usemap)], 'input' => [qw(src usemap)], 'ins' => 'cite', 'isindex' => 'action', 'head' => 'profile', 'layer' => [qw(background src)], 'link' => 'href', 'object' => [qw(codebase data archive usemap)], 'q' => 'cite', 'script' => 'src', 'table' => 'background', 'td' => 'background', 'th' => 'background', 'xmp' => 'href', ); ## forms subpackage ## @LW::forms_found=(); %LW::forms_current=(); ## http subpackage ## my $SOCKSTATE=0; my $TIMEOUT=10; # default my ($STATS_REQS,$STATS_SYNS)=(0,0); my ($LAST_HOST,$LAST_INET_ATON,$LAST_SSL)=('','',0); my ($OUTGOING_QUEUE,$INCOMING_QUEUE)=('',''); my ($SSL_CTX, $SSL_THINGY); my %http_host_cache=(); # order is following: # [0] - SOCKET # [1] - $SOCKSTATE # [2] - INET_ATON # [3] - $SSL_CTX # [4] - $SSL_THINGY # [5] - $OUTGOING_QUEUE # [6] - $INCOMING_QUEUE # [7] - $STATS_SYNS # [8] - $STATS_REQS my $Z; # array ref to current host specs =pod =head1 ++ Sub package: anti-ids The anti-ids sub package implements management routines for various rewriting/encoding in order to evade intrusion detection systems. =cut ######################################################################## =pod =head1 - Function: LW::anti_ids Params: \%hin, $modes Return: nothing LW::anti_ids computes the proper anti-ids encoding/tricks specified by $modes, and sets up %hin in order to use those tricks. Valid modes are (the mode numbers are the same as those found in whisker 1.4): 1 - Encode some of the characters via normal URL encoding 2 - Insert directory self-references (/./) 3 - Premature URL ending (make it appear the request line is done) 4 - Prepend a long random string in the form of "/string/../URL" 5 - Add a fake URL parameter 6 - Use a tab instead of a space as a request spacer 7 - Change the case of the URL around (works against Windows and Novell) 8 - Change normal seperators ('/') to Windows version ('\') 9 - Session splicing (sending data in multiple packets) You can set multiple modes by setting the string to contain all the modes desired; i.e. $modes="146" will use modes 1, 4, and 6. =cut sub anti_ids { my ($rhin,$modes)=(shift,shift); my (@T,$x,$c,$s,$y); my $ENCODED=0; my $W = $$rhin{'whisker'}; return if(!(defined $rhin && ref($rhin))); # in case they didn't do it already $$rhin{'whisker'}->{'uri_orig'}=$$rhin{'whisker'}->{'uri'}; # note: order is important! # mode 9 - session splicing if($modes=~/9/){ $$rhin{'whisker'}->{'ids_session_splice'}=1; } # mode 4 - prepend long random string if($modes=~/4/){$s=''; if($$W{'uri'}=~m#^/#){ $y=&utils_randstr; $s.=$y while(length($s)<512); $$W{'uri'}="/$s/..".$$W{'uri'}; } } # mode 7 - (windows) random case sensitivity if($modes=~/7/){ @T=split(//,$$W{'uri'}); for($x=0;$x<(scalar @T);$x++){ if((rand()*2)%2 == 1){ $T[$x]=uc($T[$x]);}} $$W{'uri'}=join('',@T); } # mode 2 - directory self-reference (/./) if($modes=~/2/){ $$W{'uri'}=~s#/#/./#g; } # mode 8 - windows directory separator (\) if($modes=~/8/){ $$W{'uri'}=~s#/#\\#g; $$W{'uri'}=~s#^\\#/#; $$W{'uri'}=~s#^(http|file|ftp|nntp|news|telnet):\\#$1://#; $$W{'uri'}=~s#\\$#/#; } # mode 1 - random URI (non-UTF8) encoding if($modes=~/1/){ if($ENCODED==0){ $$W{'uri'}=encode_str2ruri($$W{'uri'}); $ENCODED=1;} } # mode 5 - fake parameter if($modes=~/5/){ ($s,$y)=(&utils_randstr,&utils_randstr); $$W{'uri'}="/$s.html%3f$y=/../$$W{'uri'}"; } # mode 3 - premature URL ending if($modes=~/3/){ $s=&utils_randstr; $$W{'uri'}="/%20HTTP/1.1%0D%0A%0D%0AAccept%3A%20$s/../..$$W{'uri'}"; } # mode 6 - TAB as request spacer if($modes=~/6/){ $$W{'req_spacer'}="\t"; } } # end anti_ids =pod =head1 ++ Sub package: auth The auth sub package implements HTTP authentication routines. =cut ######################################################################## =pod =head1 - Function: LW::auth_brute_force Params: $auth_method, \%hin, $user, \@passwords [, $domain] Return: $first_valid_password, undef if error/none found Perform a HTTP authentication brute force against a server (host and URI defined in %hin). It will try every password in the password array for the given user. The first password (in conjunction with the given user) that doesn't return HTTP 401 is returned (and the brute force is stopped at that point). $domain is optional, and is only used for NTLM auth. =cut sub auth_brute_force { my ($auth_method, $hrin, $user, $pwordref, $dom)=@_; my ($P,%hout); return undef if(!defined $auth_method || length($auth_method)==0); return undef if(!defined $user || length($user) ==0); return undef if(!(defined $hrin && ref($hrin) )); return undef if(!(defined $pwordref && ref($pwordref))); map { ($P=$_)=~tr/\r\n//d; auth_set_header($auth_method,$hrin,$user,$P,$dom); return undef if(http_do_request($hrin,\%hout)); return $P if($hout{'whisker'}->{'http_resp'} ne 401); } @$pwordref; return undef;} ######################################################################## =pod =head1 - Function: LW::auth_set_header Params: $auth_method, \%hin, $user, $password [, $domain] Return: nothing (modifies %hin) Set the appropriate authentication header in %hin. NOTE: right now only BASIC and NTLM are supported. =cut sub auth_set_header { my ($method, $href, $user, $pass, $domain)=(lc(shift),@_); return if(!(defined $href && ref($href))); return if(!defined $user || !defined $pass); if($method eq 'basic'){ $$href{'Authorization'}='Basic '.encode_base64($user.':'.$pass,''); } if($method eq 'proxy-basic'){ $$href{'Proxy-Authorization'}='Basic '.encode_base64($user.':'.$pass,''); } if($method eq 'ntlm'){ my $o=ntlm_new($user,$pass,$domain); $$href{'whisker'}->{'ntlm_obj'}=$o; $$href{'whisker'}->{'ntlm_step'}=0; $$href{'Authorization'}='NTLM '.ntlm_client($o); } } ######################################################################## =pod =head1 - Function: LW::do_auth Params: $auth_method, \%hin, $user, $password [, $domain] Return: nothing (modifies %hin) This is an alias for auth_set_header(). =cut sub do_auth { goto &auth_set_header; } =pod =head1 ++ Sub package: bruteurl The bruteurl sub package is used to perform a brute-force of HTTP requests on an array of string components. =cut =pod =head1 - Function: LW::bruteurl Params: \%hin, $pre, $post, \@values_in, \@values_out Return: Nothing (adds to @out) Bruteurl will perform a brute force against the host/server specified in %hin. However, it will make one request per entry in @in, taking the value and setting $hin{'whisker'}->{'uri'}= $pre.value.$post. Any URI responding with an HTTP 200 or 403 response is pushed into @out. An example of this would be to brute force usernames, putting a list of common usernames in @in, setting $pre='/~' and $post='/'. =cut sub bruteurl { my ($hin, $upre, $upost, $arin, $arout)=@_; my ($U,%hout); return if(!(defined $hin && ref($hin) )); return if(!(defined $arin && ref($arin) )); return if(!(defined $arout && ref($arout))); return if(!defined $upre || length($upre) ==0); return if(!defined $upost || length($upost)==0); http_fixup_request($hin); map { ($U=$_)=~tr/\r\n//d; next if($U eq ''); if(!http_do_request($hin,\%hout,{'uri'=>$upre.$U.$upost})){ if( $hout{'whisker'}->{'http_resp'}==200 || $hout{'whisker'}->{'http_resp'}==403){ push(@{$arout},$U); } } } @$arin; } =pod =head1 ++ Sub package: cookie Cookie handling functions. Cookies are stored in a "jar" (hash), indexed by cookie name. The contents are an anonymous array: $jar{'name'}=@( 'value', 'domain', 'path', 'expire', 'secure' ) =cut ######################################################################## =pod =head1 - Function: LW::cookie_read Params: \%jar, \%hout Return: $num_of_cookies_read Read in cookies from an %hout hash (HTTP response), and put them in %jar. =cut sub cookie_read { my ($count,$jarref,$href)=(0,@_); return 0 if(!(defined $jarref && ref($jarref))); return 0 if(!(defined $href && ref($href) )); my $target = utils_find_lowercase_key($href,'set-cookie'); if(!defined $target){ return 0;} if(ref($target)){ # multiple headers foreach (@{$target}){ cookie_parse($jarref,$_); $count++; } } else { # single header cookie_parse($jarref,$target); $count=1; } return $count; } ######################################################################## =pod =head1 - Function: LW::cookie_parse Params: \%jar, $cookie Return: nothing Parses the cookie into the various parts and then sets the appropriate values in the %jar under the name; if the cookie is blank, it will delete it from the jar. =cut sub cookie_parse { my ($jarref, $header)=@_; my ($del,$part,@parts,@construct,$cookie_name)=(0); return if(!(defined $jarref && ref($jarref))); return if(!(defined $header && length($header)>0)); @parts=split(/;/,$header); foreach $part (@parts){ if($part=~/^[ \t]*(.+?)=(.*)$/){ my ($name,$val)=($1,$2); if($name=~/^domain$/i){ $val=~s#^http://##; $val=~s#/.*$##; $construct[1]=$val; } elsif($name=~/^path$/i){ $val=~s#/$## if($val ne '/'); $construct[2]=$val; } elsif($name=~/^expires$/i){ $construct[3]=$val; } else { $cookie_name=$name; if($val eq ''){ $del=1; } else { $construct[0]=$val;} } } else { if($part=~/secure/){ $construct[4]=1;} } } if($del){ delete $$jarref{$cookie_name} if defined $$jarref{$cookie_name}; } else { $$jarref{$cookie_name}=\@construct; } } ######################################################################## =pod =head1 - Function: LW::cookie_write Params: \%jar, \%hin, $override Return: nothing Goes through the given jar and sets the Cookie header in %hin pending the correct domain and path. If $override is true, then the domain and path restrictions of the cookies are ignored. Todo: factor in expire and secure. =cut sub cookie_write { my ($jarref, $hin, $override)=@_; my ($name,$out)=('',''); return if(!(defined $jarref && ref($jarref))); return if(!(defined $hin && ref($hin) )); $override=$override||0; $$hin{'whisker'}->{'ssl'}=$$hin{'whisker'}->{'ssl'}||0; foreach $name (keys %$jarref){ next if($name eq ''); next if($$hin{'whisker'}->{'ssl'}==0 && $$jarref{$name}->[4]>0); if($override || ($$hin{'whisker'}->{'host'}=~/$$jarref{$name}->[1]$/i && $$hin{'whisker'}->{'uri'}=~/$$jarref{$name}->[2]/i)){ $out.="$name=$$jarref{$name}->[0];"; } } if($out ne ''){ $$hin{'Cookie'}=$out; } } ######################################################################## =pod =head1 - Function: LW::cookie_get Params: \%jar, $name Return: @elements Fetch the named cookie from the jar, and return the components. =cut sub cookie_get { my ($jarref,$name)=@_; return undef if(!(defined $jarref && ref($jarref))); if(defined $$jarref{$name}){ return @{$$jarref{$name}};} return undef; } ######################################################################## =pod =head1 - Function: LW::cookie_set Params: \%jar, $name, $value, $domain, $path, $expire, $secure Return: nothing Set the named cookie with the provided values into the %jar. =cut sub cookie_set { my ($jarref,$name,$value,$domain,$path,$expire,$secure)=@_; my @construct; return if(!(defined $jarref && ref($jarref))); return if($name eq ''); if($value eq ''){ delete $$jarref{$name}; return;} $path=$path||'/'; $secure=$secure||0; @construct=($value,$domain,$path,$expire,$secure); $$jarref{$name}=\@construct; } ######################################################################## =pod =head1 ++ Sub package: crawl Used for crawling a website by requesting a (start) page, reading the HTML, extracting the links, and then requesting those links--up to a specified depth. The module also allows various configuration tweaks to do such things as monitor requests for offsite URLs (pages on other hosts), track various cookies, etc. =cut ##################################################### =pod =head1 - Function: LW::crawl Params: $START, $MAX_DEPTH, \%tracking, \%hin Return: Nothing The heart of the crawl package. Will perform an HTTP crawl on the specified HOST, starting at START URI, proceeding up to MAX_DEPTH. A tracking hash reference (required) stores the results of each page (and ongoing progress). The http_in_options hash reference specifies a standard HTTP hash for use in the outgoing HTTP requests. Certain options are configurable via LW::crawl_set_config(). The tracking hash will contain all the pages visited; you can get the crawl engine to skip pages by placing them in the tracking hash ahead of time. START (first) parameter should be of the form "http://www.host.com/url". =cut sub crawl { my ($START, $MAX_DEPTH, $hrtrack, $hrin)=@_; my (%hout, %jar); my ($T, @ST, @links, @tlinks, @vals, @ERRORS)=(''); return if(!(defined $hrtrack && ref($hrtrack))); return if(!(defined $hrin && ref($hrin) )); return if(!defined $START || length($START)==0); $MAX_DEPTH||=2; # $ST[0]=HOST $ST[1]=URL $ST[2]=CWD $ST[3]=HTTPS $ST[4]=SERVER # $ST[5]=PORT $ST[6]=DEPTH @vals=utils_split_uri($START); $ST[1]=$vals[0]; # uri $ST[0]=$vals[2]; # host $ST[5]=$vals[3]; # port $ST[4]=undef; # server tag return if($ST[0] eq ''); # some various informationz... $LW::crawl_config{'host'}=$ST[0]; $LW::crawl_config{'port'}=$ST[5]; $LW::crawl_config{'start'}=$ST[1]; $$hrin{'whisker'}->{'host'}=$ST[0]; $$hrin{'whisker'}->{'port'}=$ST[5]; $$hrin{'whisker'}->{'lowercase_incoming_headers'}=1; # makes life easier http_fixup_request($hrin); # this is so callbacks can access internals via references $LW::crawl_config{'ref_links'}=\@links; $LW::crawl_config{'ref_jar'}=\%jar; $LW::crawl_config{'ref_hin'}=$hrin; $LW::crawl_config{'ref_hout'}=\%hout; %LW::crawl_referrers=(); # empty out existing referrers %LW::crawl_server_tags=(); %LW::crawl_offsites=(); %LW::crawl_cookies=(); %LW::crawl_forms=(); push @links, \@{[$ST[1],1,($vals[1] eq 'https')?1:0]}; while(@links){ my $C=shift @links; $ST[1]=$C->[0]; # url $ST[6]=$C->[1]; # depth $ST[3]=$C->[2]; # https next if(defined $$hrtrack{$ST[1]} && $$hrtrack{$ST[1]} ne '?'); if($ST[6] > $MAX_DEPTH){ $$hrtrack{$ST[1]}='?' if($LW::crawl_config{'save_skipped'}>0); next; } $ST[2]=utils_get_dir($ST[1]); $$hrin{'whisker'}->{'uri'}=$ST[1]; $$hrin{'whisker'}->{'ssl'}=$ST[3]; my $result = crawl_do_request($hrin,\%hout); if($result==1 || $result==2){ push @ERRORS, "Error on making request for '$ST[1]': $hout{'whisker'}->{'error'}"; next; } if($result==0 || $result==4){ $$hrtrack{$ST[1]}=$hout{'whisker'}->{'http_resp'}; } if($result==3 || $result==5){ $$hrtrack{$ST[1]}='?' if($LW::crawl_config{'save_skipped'}>0); } if(defined $hout{'server'}){ if(!defined $ST[4]){ # server tag $ST[4]=$hout{'server'}; } $LW::crawl_server_tags{$hout{'server'}}++; } if(defined $hout{'set-cookie'}){ if($LW::crawl_config{'save_cookies'}>0){ if(ref($hout{'set-cookie'})){ foreach (@{$hout{'set-cookie'}}){ $LW::crawl_cookies{$_}++; } } else { $LW::crawl_cookies{$hout{'set-cookie'}}++; } } if($LW::crawl_config{'reuse_cookies'}>0){ cookie_read(\%jar,\%hout); } } next if($result==4 || $result==5); next if(scalar @links > $LW::crawl_config{'url_limit'}); if($result==0){ # page should be parsed if($LW::crawl_config{'source_callback'} != 0 && ref($LW::crawl_config{'source_callback'})){ &{$LW::crawl_config{'source_callback'}}($hrin,\%hout); } LW::html_find_tags(\$hout{'whisker'}->{'data'}, \&crawl_extract_links_test); $LW::crawl_config{'stats_html'}++; # count how many pages we've parsed } if($result==3){ # follow the move via location header push @LW::crawl_urls, $hout{'location'}; } foreach $T (@LW::crawl_urls){ $T=~tr/\0\r\n//d; # the NULL character is a bug that's somewhere next if (length($T)==0); next if ($T=~/^javascript:/i); # stupid javascript next if ($T=~/^mailto:/i); next if ($T=~m#^([a-zA-Z]*)://# && lc($1) ne 'http' && lc($1) ne 'https'); next if ($T=~/^#/i); # fragment if($LW::crawl_config{'callback'} != 0){ next if &{$LW::crawl_config{'callback'}}($T,@ST); } push(@{$LW::crawl_referrers{$T}}, $ST[1]) if( $LW::crawl_config{'save_referrers'}>0 ); $T=utils_absolute_uri($T,$ST[1],1) if($LW::crawl_config{'normalize_uri'}>0); @vals=utils_split_uri($T); # slashdot bug: workaround for the following fsck'd html code: #