Add the crawler mixin and a sample form extractor crawler

git-svn-id: file:///home/svn/framework3/trunk@11025 4d416f70-5f16-0410-b530-b9f4589650da
unstable
HD Moore 2010-11-13 06:40:56 +00:00
parent bc2d43d86d
commit f457ccb8f7
4 changed files with 525 additions and 3 deletions
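
The mixin is meant to be consumed the way the sample module below does: include it alongside Msf::Auxiliary::Scanner and override the callbacks you care about. A minimal sketch (hypothetical module, mirroring the sample crawler in this commit):

require 'msf/core'

class Metasploit3 < Msf::Auxiliary
  # Mix in the crawler engine and the host-iteration loop
  include Msf::Exploit::Remote::HttpCrawler
  include Msf::Auxiliary::Scanner

  def initialize
    super(
      'Name'        => 'Example Crawler Consumer', # hypothetical
      'Description' => 'Minimal consumer of the HttpCrawler mixin',
      'Author'      => 'example',
      'License'     => MSF_LICENSE
    )
  end

  # Called by the mixin once per fetched page: t is the WebTarget,
  # page is the Anemone page object, cnt is the request counter
  def crawler_process_page(t, page, cnt)
    print_status("#{page.code} #{page.url}")
  end
end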

View File

@@ -1,6 +1,5 @@
require 'rubygems'
# Load the Anemone core
require 'anemone/core'
# Overload the HTTP class with a variant that uses Rex::Proto::HTTP
require 'anemone/rex_http'

View File

@@ -0,0 +1,300 @@
module Msf

###
#
# This module provides methods for implementing a web crawler
#
###
module Exploit::Remote::HttpCrawler

  include Msf::Auxiliary::Report

  def initialize(info = {})
    super

    register_options(
      [
        Opt::RHOST,
        Opt::RPORT(80),
        OptString.new('VHOST', [ false, "HTTP server virtual host" ]),
        OptString.new('URI', [ true, "The starting page to crawl", "/"]),
        Opt::Proxies,
        OptInt.new('MAX_PAGES', [ true, 'The maximum number of pages to crawl per URL', 500]),
        OptInt.new('MAX_MINUTES', [ true, 'The maximum number of minutes to spend on each URL', 5]),
        OptInt.new('MAX_THREADS', [ true, 'The maximum number of concurrent requests', 4])
      ], self.class
    )

    register_advanced_options(
      [
        OptInt.new('RequestTimeout', [false, 'The maximum number of seconds to wait for a reply', 15]),
        OptInt.new('RedirectLimit', [false, 'The maximum number of redirects for a single request', 5]),
        OptInt.new('RetryLimit', [false, 'The maximum number of attempts for a single request', 5]),
        OptString.new('UserAgent', [true, 'The User-Agent header to use for all requests',
          "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
        ]),
        OptString.new('BasicAuthUser', [false, 'The HTTP username to specify for basic authentication']),
        OptString.new('BasicAuthPass', [false, 'The HTTP password to specify for basic authentication']),
        OptBool.new('SSL', [ false, 'Negotiate SSL for outgoing connections', false]),
        OptEnum.new('SSLVersion', [ false, 'Specify the version of SSL that should be used', 'SSL3', ['SSL2', 'SSL23', 'SSL3', 'TLS1']])
      ], self.class
    )

    register_autofilter_ports([ 80, 8080, 443, 8000, 8888, 8880, 8008, 3000, 8443 ])
    register_autofilter_services(%W{ http https })

    begin
      require 'anemone'
      @anemone_loaded = true
    rescue ::Exception => e
      @anemone_loaded = false
      @anemone_error = e
    end
  end

  # Fail early if the Anemone/Nokogiri dependencies could not be loaded
  def setup
    raise RuntimeError, "Could not load Anemone/Nokogiri: #{@anemone_error}" if not @anemone_loaded
    super
  end

  # Make sure any running crawler is shut down when the module exits
  def cleanup
    if @crawler
      @crawler.shutdown rescue nil
      @crawler = nil
    end
    super
  end

  ##
  #
  # Crawler methods and accessors
  #
  ##

  # A target object for tracking URLs
  class WebTarget < ::Hash
    def to_url
      proto = self[:ssl] ? "https" : "http"
      "#{proto}://#{self[:host]}:#{self[:port]}#{self[:path]}"
    end
  end

  # A custom error to signify we hit the page request cap
  class MaximumPageCount < ::RuntimeError
  end

  # Some accessors for stat tracking
  attr_accessor :targets
  attr_accessor :url_count, :url_total, :form_count, :request_count
  # Entry point for the crawler code
  def run_host(ip)
    print_status("Testing #{ip}...")

    self.request_count = 0
    self.form_count = 0
    self.url_count = 0
    self.url_total = 1

    path, query = datastore['URI'].split('?', 2)
    query ||= ""

    t = WebTarget.new
    t.merge!({
      :vhost => vhost,
      :host  => rhost,
      :port  => rport,
      :ssl   => ssl,
      :path  => path,
      :query => query,
      :user  => datastore['BasicAuthUser'],
      :pass  => datastore['BasicAuthPass'],
      :info  => ""
    })

    t[:site] = report_web_site(:wait => true, :host => t[:host], :port => t[:port], :vhost => t[:vhost], :ssl => t[:ssl])

    print_status("Crawling #{t.to_url}...")
    begin
      @current_vhost = t[:vhost]
      @current_site  = t[:site]
      ::Timeout.timeout(max_crawl_time) { crawl_target(t) }
    rescue ::Timeout::Error
      print_error("Crawl of #{t.to_url} has reached the configured timeout")
    ensure
      @current_vhost = nil
    end

    print_status("Crawl of #{t.to_url} complete")
  end
  # Number of seconds to wait for each response
  def get_connection_timeout
    datastore['RequestTimeout']
  end

  # Hard cap on the number of pages fetched per target
  def max_page_count
    datastore['MAX_PAGES']
  end

  # Maximum crawl time per target, in seconds
  def max_crawl_time
    datastore['MAX_MINUTES'] * 60.0
  end

  # Number of concurrent crawler threads
  def max_crawl_threads
    datastore['MAX_THREADS']
  end

  # Links matching this pattern are skipped (static and binary content)
  def get_link_filter
    /\.(js|png|jpe?g|bmp|gif|swf|jar|zip|gz|bz2|rar|pdf|docx?|pptx?)$/i
  end

  # Decide which links on a page are followed (all of them by default)
  def focus_crawl(page)
    page.links
  end
  # Configure an Anemone instance and crawl a single WebTarget
  def crawl_target(t)
    cnt  = 0
    opts = crawler_options(t)
    url  = t.to_url

    @crawler = ::Anemone::Core.new([url], opts)
    @crawler.on_every_page do |page|
      cnt += 1
      self.request_count += 1

      # Extract any interesting data from the page
      crawler_process_page(t, page, cnt)

      # Sync the database every 100 items
      if cnt % 100 == 0
        framework.db.sync
      end

      # Blow up if we hit our maximum page count
      if cnt >= max_page_count
        print_error("Maximum page count reached for #{url}")
        raise MaximumPageCount, "Maximum page count reached"
      end
    end

    # Skip link processing based on a regular expression
    @crawler.skip_links_like(
      get_link_filter
    )

    # Focus our crawling on interesting, but not over-crawled links
    @crawler.focus_crawl do |page|
      focus_crawl(page)
    end

    begin
      @crawler.run
    rescue MaximumPageCount
      # No need to print anything else
    rescue ::Timeout::Error
      # Bubble this up to the top-level handler
      raise $!
    rescue ::Exception => e
      print_error("Crawler Exception: #{url} #{e} #{e.backtrace}")
    ensure
      @crawler.shutdown rescue nil
      @crawler = nil
    end
  end
  # Default per-page callback: print one line per request, keyed to the response code
  def crawler_process_page(t, page, cnt)
    msg = "[#{"%.5d" % cnt}/#{"%.5d" % max_page_count}] #{page.code || "ERR"} - #{@current_site.vhost} - #{page.url}"
    case page.code
    when 301, 302
      if page.headers and page.headers["location"]
        print_status(msg + " -> " + page.headers["location"].to_s)
      else
        print_status(msg)
      end
    when 500...599
      # XXX: Log the fact that we hit an error page
      print_good(msg)
    when 401, 403
      print_good(msg)
    when 200
      print_status(msg)
    when 404
      print_error(msg)
    else
      print_error(msg)
    end
  end
  # Translate the datastore settings into an Anemone options hash
  def crawler_options(t)
    opts = {}
    opts[:user_agent]          = datastore['UserAgent']
    opts[:verbose]             = false
    opts[:threads]             = max_crawl_threads
    opts[:obey_robots_txt]     = false
    opts[:redirect_limit]      = datastore['RedirectLimit']
    opts[:retry_limit]         = datastore['RetryLimit']
    opts[:accept_cookies]      = true
    opts[:depth_limit]         = false
    opts[:skip_query_strings]  = false
    opts[:discard_page_bodies] = true
    opts[:framework]           = framework
    opts[:module]              = self
    opts[:timeout]             = get_connection_timeout
    opts
  end
  ##
  #
  # Wrappers for getters
  #
  ##

  #
  # Returns the target host
  #
  def rhost
    datastore['RHOST']
  end

  #
  # Returns the remote port
  #
  def rport
    datastore['RPORT']
  end

  #
  # Returns the VHOST of the HTTP server.
  #
  def vhost
    datastore['VHOST'] || datastore['RHOST']
  end

  #
  # Returns the boolean indicating SSL
  #
  def ssl
    ((datastore.default?('SSL') and rport.to_i == 443) or datastore['SSL'])
  end

  #
  # Returns the string indicating SSL version
  #
  def ssl_version
    datastore['SSLVersion']
  end

  #
  # Returns the configured proxy list
  #
  def proxies
    datastore['Proxies']
  end

end
end

View File

@@ -34,6 +34,7 @@ require 'msf/core/exploit/telnet'
require 'msf/core/exploit/ftpserver'
require 'msf/core/exploit/http/client'
require 'msf/core/exploit/http/server'
require 'msf/core/exploit/http/crawler'
require 'msf/core/exploit/smtp'
require 'msf/core/exploit/dcerpc'
require 'msf/core/exploit/sunrpc'

View File

@@ -0,0 +1,222 @@
##
# $Id$
##

##
# This file is part of the Metasploit Framework and may be subject to
# redistribution and commercial restrictions. Please see the Metasploit
# Framework web site for more information on licensing and terms of use.
# http://metasploit.com/framework/
##

require 'rex/proto/http'
require 'msf/core'

class Metasploit3 < Msf::Auxiliary

  include Msf::Exploit::Remote::HttpCrawler
  include Msf::Auxiliary::Scanner

  def initialize
    super(
      'Name'        => 'Web Site Crawler',
      'Version'     => '$Revision$',
      'Description' => 'Crawl one or more web sites and store information about what was found',
      'Author'      => 'hdm',
      'License'     => MSF_LICENSE
    )
  end

  # Scrub links that end in these extensions
  def get_link_filter
    /\.(js|png|jpe?g|bmp|gif|swf|jar|zip|gz|bz2|rar|pdf|docx?|pptx?)$/i
  end
=begin
  # Example override, disabled by default: prefer dynamic content over non-dynamic links
  def focus_crawl(page)
    page.links
  end
=end

  #
  # The main callback from the crawler
  #
  # Data we will report:
  # - The path of any URL found by the crawler (web.uri, :path => page.path)
  # - The occurrence of any form (web.form :path, :type (get|post|path_info), :params)
  #
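  # For example, a crawled link like /test.php?a=b&c=d ends up reported
  # (shape inferred from the report_web_form call below) roughly as:
  #   { :web_site => @current_site, :method => "GET",
  #     :path => "/test.php", :params => [["a", "b"], ["c", "d"]] }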
  def crawler_process_page(t, page, cnt)
    msg = "[#{"%.5d" % cnt}/#{"%.5d" % max_page_count}] #{page.code || "ERR"} - #{@current_site.vhost} - #{page.url}"
    case page.code
    when 301, 302
      if page.headers and page.headers["location"]
        print_status(msg + " -> " + page.headers["location"].to_s)
      else
        print_status(msg)
      end
    when 500...599
      # XXX: Log the fact that we hit an error page
      print_good(msg)
    when 401, 403
      print_good(msg)
    when 200
      print_status(msg)
    when 404
      print_error(msg)
    else
      print_error(msg)
    end

    #
    # Process the web page
    #
    info = {
      :web_site => @current_site,
      :path     => page.url.path,
      :query    => page.url.query,
      :code     => page.code,
      :body     => page.body,
      :headers  => page.headers
    }

    if page.headers['content-type']
      info[:ctype] = page.headers['content-type']
    end

    if page.headers['set-cookie']
      info[:cookie] = page.headers['set-cookie']
    end

    if page.headers['authorization']
      info[:auth] = page.headers['authorization']
    end

    if page.headers['location']
      info[:location] = page.headers['location']
    end

    if page.headers['last-modified']
      info[:mtime] = page.headers['last-modified']
    end

    # Report the web page to the database
    report_web_page(info)

    # Only process interesting response codes
    return if not [302, 301, 200, 500, 401, 403, 404].include?(page.code)
    #
    # Skip certain types of forms right off the bat
    #

    # Apache multiview directories
    return if page.url.query =~ /^C=[A-Z];O=/ # Apache

    # Scrub out the jsessionid appends
    page.url.path = page.url.path.sub(/;jsessionid=[a-zA-Z0-9]+/, '')

    #
    # Continue processing forms
    #
    forms = []
    form_template = { :web_site => @current_site }
    form = {}.merge(form_template)

    # This page has a query parameter we can test with GET parameters
    # ex: /test.php?a=b&c=d
    if page.url.query and not page.url.query.empty?
      form[:method] = 'GET'
      form[:path]   = page.url.path
      vars = page.url.query.split('&').map{ |x| x.split("=", 2) }
      form[:params] = vars
    end

    # This is a REST-ish application with numeric parameters
    # ex: /customers/343
    if not form[:path] and page.url.path.to_s =~ /(.*)\/(\d+)$/
      path_base = $1
      path_info = $2
      form[:method] = 'PATH'
      form[:path]   = path_base
      form[:params] = [['PATH', path_info]]
      form[:query]  = page.url.query.to_s
    end

    # This is an application that uses PATH_INFO for parameters:
    # ex: /index.php/Main_Page/Article01
    if not form[:path] and page.url.path.to_s =~ /(.*\/[a-zA-Z0-9]{3,256}\.[a-zA-Z0-9]{2,8})(\/.*)/
      path_base = $1
      path_info = $2
      form[:method] = 'PATH'
      form[:path]   = path_base
      form[:params] = [['PATH', path_info]]
      form[:query]  = page.url.query.to_s
    end

    # Done processing URI-based forms
    forms << form
    if page.doc
      page.doc.css("form").each do |f|

        target = page.url

        if f['action'] and not f['action'].strip.empty?
          action = f['action']

          # Prepend relative URLs with the current directory
          if action[0,1] != "/" and action !~ /\:\/\//
            # Extract the base href first
            base = target.path.gsub(/(.*\/)[^\/]+$/, "\\1")
            page.doc.css("base").each do |bref|
              if bref['href']
                base = bref['href']
              end
            end
            action = (base + "/").sub(/\/\/$/, '/') + action
          end

          target = page.to_absolute(URI( action )) rescue next

          if not page.in_domain?(target)
            # Replace 127.0.0.1 and non-qualified hostnames with our page.host
            # ex: http://localhost/url OR http://www01/url
            target_uri = URI(target.to_s)
            if (target_uri.host.index(".").nil? or target_uri.host == "127.0.0.1")
              target_uri.host = page.url.host
              target = target_uri
            else
              next
            end
          end
        end

        form = {}.merge!(form_template)
        form[:method] = (f['method'] || 'GET').upcase
        form[:query]  = target.query.to_s if form[:method] != "GET"
        form[:path]   = target.path
        form[:params] = []
        f.css('input', 'textarea').each do |inp|
          form[:params] << [inp['name'].to_s, inp['value'] || inp.content || '', { :type => inp['type'].to_s }]
        end

        # XXX: handle SELECT elements
        forms << form
      end
    end

    # Report each of the discovered forms
    forms.each do |form|
      next if not form[:method]
      print_status((" " * 24) + "FORM: #{form[:method]} #{form[:path]}")
      report_web_form(form)
      self.form_count += 1
    end
  end

end
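
Assuming the sample module lands under auxiliary/scanner/http (the exact module path is not shown in this diff), a typical run from msfconsole would look something like this; RHOSTS comes from the Scanner mixin, the rest are options registered by HttpCrawler above:

msf > use auxiliary/scanner/http/crawler
msf auxiliary(crawler) > set RHOSTS 192.168.0.10
msf auxiliary(crawler) > set URI /
msf auxiliary(crawler) > set MAX_PAGES 200
msf auxiliary(crawler) > run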