# -*- coding: binary -*-
module Msf

###
#
# This module provides methods for implementing a web crawler
#
###
module Auxiliary::HttpCrawler

  include ::Msf::Auxiliary::Report

  def initialize(info = {})
    super

    register_options(
      [
        Opt::RHOST,
        Opt::RPORT(80),
        OptString.new('VHOST', [ false, "HTTP server virtual host" ]),
        OptString.new('URI', [ true, "The starting page to crawl", "/"]),
        Opt::Proxies,
        OptInt.new('MAX_PAGES', [ true, 'The maximum number of pages to crawl per URL', 500]),
        OptInt.new('MAX_MINUTES', [ true, 'The maximum number of minutes to spend on each URL', 5]),
        OptInt.new('MAX_THREADS', [ true, 'The maximum number of concurrent requests', 4]),
        OptString.new('USERNAME', [false, 'The HTTP username to specify for authentication']),
        OptString.new('PASSWORD', [false, 'The HTTP password to specify for authentication']),
        OptString.new('DOMAIN', [ true, 'The domain to use for windows authentication', 'WORKSTATION'])
      ], self.class
    )

    register_advanced_options(
      [
        OptBool.new('DirBust', [ false, 'Bruteforce common URL paths', true]),
        OptInt.new('RequestTimeout', [false, 'The maximum number of seconds to wait for a reply', 15]),
        OptInt.new('RedirectLimit', [false, 'The maximum number of redirects for a single request', 5]),
        OptInt.new('RetryLimit', [false, 'The maximum number of attempts for a single request', 5]),
        OptString.new('UserAgent', [true, 'The User-Agent header to use for all requests',
          "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
        ]),
        OptString.new('BasicAuthUser', [false, 'The HTTP username to specify for basic authentication']),
        OptString.new('BasicAuthPass', [false, 'The HTTP password to specify for basic authentication']),
        OptString.new('HTTPAdditionalHeaders', [false, "A list of additional headers to send (separated by \\x01)"]),
        OptString.new('HTTPCookie', [false, "A HTTP cookie header to send with each request"]),
        OptBool.new('SSL', [ false, 'Negotiate SSL for outgoing connections', false]),
        OptEnum.new('SSLVersion', [ false, 'Specify the version of SSL that should be used', 'SSL3', ['SSL2', 'SSL23', 'SSL3', 'TLS1']]),
      ], self.class
    )

    register_autofilter_ports([ 80, 8080, 443, 8000, 8888, 8880, 8008, 3000, 8443 ])
    register_autofilter_services(%W{ http https })

    begin
      require 'anemone'
      @anemone_loaded = true
    rescue ::Exception => e
      @anemone_loaded = false
      @anemone_error  = e
    end
  end

  def setup
    raise RuntimeError, "Could not load Anemone/Nokogiri: #{@anemone_error}" if not @anemone_loaded
    super
  end

  def cleanup
    if @crawler
      @crawler.shutdown rescue nil
      @crawler = nil
    end
    super
  end

  ##
  #
  # Crawler methods and accessors
  #
  ##

  # A target object for tracking URLs
  class WebTarget < ::Hash
    def to_url
      proto = self[:ssl] ? "https" : "http"
      host = self[:vhost] ? self[:vhost] : self[:host]
      if Rex::Socket.is_ipv6?(host)
        host = "[#{host}]"
      end
      "#{proto}://#{host}:#{self[:port]}#{self[:path]}"
    end
  end
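
  # Illustrative note (the values below are hypothetical, not from a live
  # target): a WebTarget such as
  #
  #   WebTarget.new.merge!(
  #     :ssl => true, :host => '192.0.2.10', :vhost => 'example.com',
  #     :port => 8443, :path => '/app/'
  #   )
  #
  # would yield "https://example.com:8443/app/" from #to_url, since the
  # virtual host (when set) takes precedence over the raw host address.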

  # A custom error to signify we hit the page request cap
  class MaximumPageCount < ::RuntimeError
  end

  # Some accessors for stat tracking
  attr_accessor :targets
  attr_accessor :url_count, :url_total, :form_count, :request_count

  # Entry point for the crawler code
  def run
    self.request_count = 0
    self.form_count = 0
    self.url_count = 0
    self.url_total = 1

    path, query = datastore['URI'].split('?', 2)
    query ||= ""

    t = WebTarget.new

    t.merge!({
      :vhost => vhost,
      :host  => rhost,
      :port  => rport,
      :ssl   => ssl,
      :path  => path,
      :query => query,
      :info  => ""
    })

    if datastore['USERNAME'] and datastore['USERNAME'] != ''
      t[:username] = datastore['USERNAME'].to_s
      t[:password] = datastore['PASSWORD'].to_s
      t[:domain]   = datastore['DOMAIN'].to_s
    end

    if datastore['HTTPCookie']
      t[:cookies] = {}
      datastore['HTTPCookie'].to_s.split(';').each do |pair|
        k, v = pair.strip.split('=', 2)
        next if not v
        t[:cookies][k] = v
      end
    end

    if datastore['HTTPAdditionalHeaders']
      t[:headers] = datastore['HTTPAdditionalHeaders'].to_s.split("\x01").select{|x| x.to_s.length > 0}
    end

    t[:site] = report_web_site(:wait => true, :host => t[:host], :port => t[:port], :vhost => t[:vhost], :ssl => t[:ssl])

    print_status("Crawling #{t.to_url}...")

    begin
      @current_vhost = t[:vhost]
      @current_site  = t[:site]
      ::Timeout.timeout(max_crawl_time) { crawl_target(t) }
    rescue ::Timeout::Error
      print_error("Crawl of #{t.to_url} has reached the configured timeout")
    ensure
      @current_vhost = nil
    end

    print_status("Crawl of #{t.to_url} complete")
  end

  def get_connection_timeout
    datastore['RequestTimeout']
  end

  def max_page_count
    datastore['MAX_PAGES']
  end

  def max_crawl_time
    datastore['MAX_MINUTES'] * 60.0
  end

  def max_crawl_threads
    datastore['MAX_THREADS']
  end

  def dirbust?
    datastore['DirBust']
  end

  # Scrub links that end in these extensions. If more or less is
  # desired by a particular module, this should get redefined.
  def get_link_filter
    /\.(js|png|jpe?g|bmp|gif|swf|jar|zip|gz|bz2|rar|pdf|docx?|pptx?)$/i
  end

  def focus_crawl(page)
    page.links
  end

  def crawl_target(t)
    cnt  = 0
    opts = crawler_options(t)
    url  = t.to_url

    @crawler = ::Anemone::Core.new([url], opts)

    @crawler.on_every_page do |page|
      cnt += 1
      self.request_count += 1

      # Extract any interesting data from the page
      crawler_process_page(t, page, cnt)

      # Blow up if we hit our maximum page count
      if cnt >= max_page_count
        print_error("Maximum page count reached for #{url}")
        raise MaximumPageCount, "Maximum page count reached"
      end
    end

    # Skip link processing based on a regular expression
    @crawler.skip_links_like(
      get_link_filter
    )

    # Focus our crawling on interesting, but not over-crawled links
    @crawler.focus_crawl do |page|
      focus_crawl(page)
    end

    begin
      @crawler.run
    rescue MaximumPageCount
      # No need to print anything else
    rescue ::Timeout::Error
      # Bubble this up to the top-level handler
      raise $!
    rescue ::Exception => e
      # Anemone can surface an anonymous timeout exception of unclear
      # origin; treat "execution expired" as a normal timeout.
      if e.to_s =~ /execution expired/
        raise ::Timeout::Error
      else
        print_error("Crawler Exception: #{url} #{e} #{e.backtrace}")
      end
    ensure
      @crawler.shutdown rescue nil
      @crawler = nil
    end
  end

  # Specific module implementations should redefine this method
  # with whatever is meaningful to them.
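  #
  # A minimal override sketch (the status check below is hypothetical,
  # not part of this mixin); the arguments are the WebTarget being
  # crawled, the Anemone page object, and the running page count:
  #
  #   # In a module that includes Msf::Auxiliary::HttpCrawler:
  #   def crawler_process_page(t, page, cnt)
  #     return unless page.code == 200
  #     print_good("#{t.to_url} => #{page.url} (#{page.code})")
  #   end
  #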
  def crawler_process_page(t, page, cnt)
    msg = "[#{"%.5d" % cnt}/#{"%.5d" % max_page_count}] #{page.code || "ERR"} - #{@current_site.vhost} - #{page.url}"
    case page.code
      when 301,302
        if page.headers and page.headers["location"]
          print_status(msg + " -> " + page.headers["location"].to_s)
        else
          print_status(msg)
        end
      when 500...599
        # XXX: Log the fact that we hit an error page
        print_good(msg)
      when 401,403
        print_good(msg)
      when 200
        print_status(msg)
      when 404
        print_error(msg)
      else
        print_error(msg)
    end
  end

  def crawler_options(t)
    opts = {}
    opts[:user_agent]      = datastore['UserAgent']
    opts[:verbose]         = false
    opts[:threads]         = max_crawl_threads
    opts[:obey_robots_txt] = false
    opts[:redirect_limit]  = datastore['RedirectLimit']
    opts[:retry_limit]     = datastore['RetryLimit']
    opts[:accept_cookies]  = true
    opts[:depth_limit]     = false
    opts[:skip_query_strings]  = false
    opts[:discard_page_bodies] = true
    opts[:framework]       = framework
    opts[:module]          = self
    opts[:timeout]         = get_connection_timeout
    opts[:dirbust]         = dirbust?

    if (t[:headers] and t[:headers].length > 0)
      opts[:inject_headers] = t[:headers]
    end

    if t[:cookies]
      opts[:cookies] = t[:cookies]
    end

    opts[:username] = t[:username] || ''
    opts[:password] = t[:password] || ''
    opts[:domain]   = t[:domain]   || 'WORKSTATION'

    opts
  end

  ##
  #
  # Wrappers for getters
  #
  ##

  #
  # Returns the target host
  #
  def rhost
    datastore['RHOST']
  end

  #
  # Returns the remote port
  #
  def rport
    datastore['RPORT']
  end

  #
  # Returns the VHOST of the HTTP server.
  #
  def vhost
    datastore['VHOST'] || datastore['RHOST']
  end

  #
  # Returns the boolean indicating SSL
  #
  def ssl
    ((datastore.default?('SSL') and rport.to_i == 443) or datastore['SSL'])
  end

  #
  # Returns the string indicating SSL version
  #
  def ssl_version
    datastore['SSLVersion']
  end

  #
  # Returns the configured proxy list
  #
  def proxies
    datastore['Proxies']
  end

end
end
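
# Usage sketch (hypothetical module, not shipped with this mixin): an
# auxiliary module mixes in Msf::Auxiliary::HttpCrawler, optionally widens
# the default link filter, and overrides crawler_process_page; running the
# module then drives the Anemone-based crawl against RHOST/RPORT/URI.
#
#   class MetasploitModule < Msf::Auxiliary
#     include Msf::Auxiliary::HttpCrawler
#
#     def initialize(info = {})
#       super(update_info(info,
#         'Name'        => 'Example HTTP Crawler',
#         'Description' => 'Walks a site and logs each page',
#         'License'     => MSF_LICENSE
#       ))
#     end
#
#     # Also skip 7z archives in addition to the default extension list
#     def get_link_filter
#       /\.(js|png|jpe?g|bmp|gif|swf|jar|zip|gz|bz2|rar|pdf|docx?|pptx?|7z)$/i
#     end
#
#     def crawler_process_page(t, page, cnt)
#       print_status("#{cnt}: #{page.code} #{page.url}")
#     end
#   end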