299 lines
6.7 KiB
Ruby
299 lines
6.7 KiB
Ruby
module Msf
|
|
|
|
###
|
|
#
|
|
# This module provides methods for implementing a web crawler
|
|
#
|
|
###
|
|
module Auxiliary::HttpCrawler
|
|
include Msf::Auxiliary::Report
|
|
|
|
#
# Registers the crawler's standard and advanced datastore options, the
# autofilter ports/services, and records whether the 'anemone' library could
# be required so that #setup can report a missing dependency.
#
# @param info [Hash] module information; passed on to the parent via +super+
#
def initialize(info = {})
  super

  register_options(
    [
      Opt::RHOST,
      Opt::RPORT(80),
      OptString.new('VHOST', [ false, "HTTP server virtual host" ]),
      OptString.new('URI', [ true, "The starting page to crawl", "/"]),
      Opt::Proxies,
      OptInt.new('MAX_PAGES', [ true, 'The maximum number of pages to crawl per URL', 500]),
      OptInt.new('MAX_MINUTES', [ true, 'The maximum number of minutes to spend on each URL', 5]),
      OptInt.new('MAX_THREADS', [ true, 'The maximum number of concurrent requests', 4])
    ], self.class
  )

  register_advanced_options(
    [
      OptInt.new('RequestTimeout', [false, 'The maximum number of seconds to wait for a reply', 15]),
      OptInt.new('RedirectLimit', [false, 'The maximum number of redirects for a single request', 5]),
      OptInt.new('RetryLimit', [false, 'The maximum number of attempts for a single request', 5]),
      OptString.new('UserAgent', [true, 'The User-Agent header to use for all requests',
        "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
      ]),
      OptString.new('BasicAuthUser', [false, 'The HTTP username to specify for basic authentication']),
      OptString.new('BasicAuthPass', [false, 'The HTTP password to specify for basic authentication']),
      OptBool.new('SSL', [ false, 'Negotiate SSL for outgoing connections', false]),
      OptEnum.new('SSLVersion', [ false, 'Specify the version of SSL that should be used', 'SSL3', ['SSL2', 'SSL23', 'SSL3', 'TLS1']]),
    ], self.class
  )

  register_autofilter_ports([ 80, 8080, 443, 8000, 8888, 8880, 8008, 3000, 8443 ])
  register_autofilter_services(%w[http https])

  begin
    require 'anemone'
    @anemone_loaded = true
  rescue ::ScriptError, ::StandardError => e
    # LoadError is a ScriptError rather than a StandardError, so both
    # families are listed. Interrupt, SystemExit and similar process-level
    # exceptions are deliberately left to propagate.
    @anemone_loaded = false
    @anemone_error = e
  end
end
|
|
|
|
#
# Refuses to continue when the 'anemone' require attempted in #initialize
# failed; otherwise hands over to the parent implementation.
#
# @raise [RuntimeError] when @anemone_loaded is not truthy
#
def setup
  unless @anemone_loaded
    raise RuntimeError, "Could not load Anemone/Nokogiri: #{@anemone_error}"
  end

  super
end
|
|
|
|
#
# Shuts down a crawler that is still referenced by @crawler (best effort:
# StandardError raised by the shutdown is ignored), clears the reference and
# then calls the parent implementation.
#
def cleanup
  unless @crawler.nil? || @crawler == false
    begin
      @crawler.shutdown
    rescue StandardError
      nil
    end
    @crawler = nil
  end

  super
end
|
|
|
|
##
|
|
#
|
|
# Crawler methods and accessors
|
|
#
|
|
##
|
|
|
|
# A target object for tracking URLs
|
|
class WebTarget < ::Hash
  # Builds the absolute URL for this target from the :ssl, :host, :port and
  # :path entries. The scheme is "https" when :ssl is truthy, "http"
  # otherwise.
  #
  # A host containing a colon is treated as an IPv6 literal and wrapped in
  # square brackets (unless it already is), because an unbracketed literal
  # cannot be told apart from the port separator.
  #
  # @return [String] the URL, e.g. "http://example.com:80/"
  def to_url
    proto = self[:ssl] ? "https" : "http"
    host = self[:host].to_s
    host = "[#{host}]" if host.include?(':') && !host.start_with?('[')
    "#{proto}://#{host}:#{self[:port]}#{self[:path]}"
  end
end
|
|
|
|
# A custom error to signify we hit the page request cap
|
|
# Raised from the per-page callback in #crawl_target and rescued there.
class MaximumPageCount < ::RuntimeError; end
|
|
|
|
# Some accessors for stat tracking
|
|
# :targets is not assigned anywhere in this mixin; the four counters are
# reset at the start of #run.
attr_accessor :targets,
              :url_count,
              :url_total,
              :form_count,
              :request_count
|
|
|
|
|
|
# Entry point for the crawler code
|
|
#
# Resets the statistics counters, builds a WebTarget from the datastore
# options, registers the site through report_web_site and crawls it inside a
# Timeout of max_crawl_time seconds.
#
# The URI option is split into path and query. The path is normalised so it
# always begins with "/": an empty or relative value would otherwise be glued
# straight onto the port in WebTarget#to_url.
#
def run
  self.request_count = 0
  self.form_count = 0
  self.url_count = 0
  self.url_total = 1

  path, query = datastore['URI'].to_s.split('?', 2)
  path = "/#{path}" unless path.to_s.start_with?('/')
  query ||= ""

  t = WebTarget.new

  t.merge!({
    :vhost => vhost,
    :host => rhost,
    :port => rport,
    :ssl => ssl,
    :path => path,
    :query => query,
    :user => datastore['BasicAuthUser'],
    :pass => datastore['BasicAuthPass'],
    :info => ""
  })

  t[:site] = report_web_site(:wait => true, :host => t[:host], :port => t[:port], :vhost => t[:vhost], :ssl => t[:ssl])

  print_status("Crawling #{t.to_url}...")
  begin
    # crawler_process_page reads @current_site while pages are handled
    @current_vhost = t[:vhost]
    @current_site = t[:site]
    ::Timeout.timeout(max_crawl_time) { crawl_target(t) }
  rescue ::Timeout::Error
    print_error("Crawl of #{t.to_url} has reached the configured timeout")
  ensure
    @current_vhost = nil
  end
  print_status("Crawl of #{t.to_url} complete")
end
|
|
|
|
# Value of the RequestTimeout advanced option (described there as the maximum
# number of seconds to wait for a reply).
def get_connection_timeout
  request_timeout = datastore['RequestTimeout']
  request_timeout
end
|
|
|
|
# Value of the MAX_PAGES option: the page cap enforced in #crawl_target.
def max_page_count
  page_limit = datastore['MAX_PAGES']
  page_limit
end
|
|
|
|
# Crawl budget in seconds: the MAX_MINUTES option multiplied by 60.0, so the
# result is a Float.
def max_crawl_time
  minutes = datastore['MAX_MINUTES']
  minutes * 60.0
end
|
|
|
|
# Value of the MAX_THREADS option; passed to the crawler as :threads by
# #crawler_options.
def max_crawl_threads
  thread_limit = datastore['MAX_THREADS']
  thread_limit
end
|
|
|
|
# Case-insensitive pattern of file extensions handed to skip_links_like in
# #crawl_target (scripts, images, archives, office documents, ...).
#
# @return [Regexp]
def get_link_filter
  skipped_extensions = /\.(js|png|jpe?g|bmp|gif|swf|jar|zip|gz|bz2|rar|pdf|docx?|pptx?)$/i
  skipped_extensions
end
|
|
|
|
# Link selection hook used by the focus_crawl callback in #crawl_target.
# This implementation applies no filtering and returns page.links unchanged.
#
# @param page [#links] the page that was just fetched
# @return the value of page.links
def focus_crawl(page)
  candidate_links = page.links
  candidate_links
end
|
|
|
|
#
# Crawls one target with Anemone. Each fetched page is counted, passed to
# #crawler_process_page, and the crawl stops once max_page_count pages have
# been seen. The crawler is always shut down and @crawler cleared on exit.
#
# Timeout::Error, Interrupt and SystemExit are re-raised for the caller; any
# other exception from the crawl is reported with print_error and swallowed.
#
# @param t [WebTarget] the target; only #to_url is used here, the object is
#   otherwise passed on to #crawler_options and #crawler_process_page
#
def crawl_target(t)
  cnt = 0
  opts = crawler_options(t)
  url = t.to_url

  @crawler = ::Anemone::Core.new([url], opts)
  @crawler.on_every_page do |page|
    cnt += 1
    self.request_count += 1

    # Extract any interesting data from the page
    crawler_process_page(t, page, cnt)

    # Sync the database every 100 items
    framework.db.sync if cnt % 100 == 0

    # Blow up if we hit our maximum page count
    if cnt >= max_page_count
      print_error("Maximum page count reached for #{url}")
      raise MaximumPageCount, "Maximum page count reached"
    end
  end

  # Skip link processing based on a regular expression
  @crawler.skip_links_like(get_link_filter)

  # Focus our crawling on interesting, but not over-crawled links
  @crawler.focus_crawl { |page| focus_crawl(page) }

  begin
    @crawler.run
  rescue MaximumPageCount
    # No need to print anything else
  rescue ::Timeout::Error, ::Interrupt, ::SystemExit
    # Bubble these up: the timeout belongs to the top-level handler, and a
    # user interrupt or exit request must not be reported as a crawl error.
    raise
  rescue ::Exception => e
    print_error("Crawler Exception: #{url} #{e} #{e.backtrace}")
  ensure
    @crawler.shutdown rescue nil
    @crawler = nil
  end
end
|
|
|
|
#
# Prints one progress line for a crawled page; the print level depends on the
# HTTP status:
#   301/302          -> status, with the Location header appended if present
#   500..599, 401/403 -> good
#   200              -> status
#   anything else (including 404 and a nil code) -> error
#
# @param t    the crawl target (not used by this implementation)
# @param page the crawled page; #code, #headers and #url are read
# @param cnt  [Integer] 1-based index of the page within the crawl
#
def crawler_process_page(t, page, cnt)
  msg = "[#{"%.5d" % cnt}/#{"%.5d" % max_page_count}] #{page.code || "ERR"} - #{@current_site.vhost} - #{page.url}"
  case page.code
  when 301, 302
    location = page.headers && page.headers["location"]
    if location
      print_status(msg + " -> " + location.to_s)
    else
      print_status(msg)
    end
  when 500..599
    # Inclusive range: the previous exclusive form (500...599) sent status
    # 599 to the error branch.
    # XXX: Log the fact that we hit an error page
    print_good(msg)
  when 401, 403
    print_good(msg)
  when 200
    print_status(msg)
  else
    print_error(msg)
  end
end
|
|
|
|
#
# Assembles the options hash given to ::Anemone::Core.new in #crawl_target.
# Values come from the datastore and the limit helpers; the rest are fixed.
#
# @param t the crawl target (not consulted by this implementation)
# @return [Hash] crawler options keyed by Symbol
#
def crawler_options(t)
  {
    :user_agent => datastore['UserAgent'],
    :verbose => false,
    :threads => max_crawl_threads,
    :obey_robots_txt => false,
    :redirect_limit => datastore['RedirectLimit'],
    :retry_limit => datastore['RetryLimit'],
    :accept_cookies => true,
    :depth_limit => false,
    :skip_query_strings => false,
    :discard_page_bodies => true,
    :framework => framework,
    :module => self,
    :timeout => get_connection_timeout
  }
end
|
|
|
|
|
|
##
|
|
#
|
|
# Wrappers for getters
|
|
#
|
|
##
|
|
|
|
#
|
|
# Returns the target host
|
|
#
|
|
def rhost
  # Plain read of the RHOST datastore option.
  target_host = datastore['RHOST']
  target_host
end
|
|
|
|
#
|
|
# Returns the remote port
|
|
#
|
|
def rport
  # Plain read of the RPORT datastore option; no conversion is applied here.
  target_port = datastore['RPORT']
  target_port
end
|
|
|
|
#
|
|
# Returns the VHOST of the HTTP server.
|
|
#
|
|
def vhost
  # Prefer an explicitly configured VHOST; fall back to RHOST when the VHOST
  # option is nil or false.
  configured = datastore['VHOST']
  return configured if configured

  datastore['RHOST']
end
|
|
|
|
#
|
|
# Returns the boolean indicating SSL
|
|
#
|
|
def ssl
  # When the SSL option is still at its default and the port is 443, SSL is
  # taken to be on; in every other case the SSL option value is returned.
  return true if datastore.default?('SSL') && rport.to_i == 443

  datastore['SSL']
end
|
|
|
|
#
|
|
# Returns the string indicating SSL version
|
|
#
|
|
def ssl_version
  # Plain read of the SSLVersion advanced option.
  version = datastore['SSLVersion']
  version
end
|
|
|
|
#
|
|
# Returns the configured proxy list
|
|
#
|
|
def proxies
  # Plain read of the Proxies datastore option.
  proxy_list = datastore['Proxies']
  proxy_list
end
|
|
|
|
|
|
end
|
|
|
|
end
|