2010-04-26 18:29:24 +00:00
|
|
|
#!/usr/bin/env ruby
|
|
|
|
#
|
2010-05-03 17:13:09 +00:00
|
|
|
# $Id$
|
|
|
|
#
|
|
|
|
# Web Crawler.
|
2010-04-26 18:29:24 +00:00
|
|
|
#
|
|
|
|
# Author: et [at] metasploit.com 2010
|
2010-05-03 17:13:09 +00:00
|
|
|
# $Revision$
|
2010-04-26 18:29:24 +00:00
|
|
|
#
|
|
|
|
#
|
|
|
|
|
|
|
|
# openssl before rubygems mac os
|
|
|
|
require 'openssl'
|
|
|
|
require 'rubygems'
|
2010-04-08 03:39:17 +00:00
|
|
|
require 'rinda/tuplespace'
|
2010-04-26 18:29:24 +00:00
|
|
|
require 'pathname'
|
|
|
|
require 'uri'
|
|
|
|
|
|
|
|
begin
|
|
|
|
require 'sqlite3'
|
|
|
|
rescue LoadError
|
|
|
|
puts "Error: sqlite3-ruby not found"
|
|
|
|
end
|
2010-05-03 17:13:09 +00:00
|
|
|
|
2010-04-26 18:29:24 +00:00
|
|
|
# Locate the real path of this script so the framework's lib directory can be
# found relative to it.  File.readlink may return a path that is relative to
# the directory holding the symlink (and may itself point at another symlink),
# so every hop is expanded against the directory of the link being followed.
msfbase = __FILE__
while File.symlink?(msfbase)
  msfbase = File.expand_path(File.readlink(msfbase), File.dirname(msfbase))
end

# Make <script dir>/../lib the first entry on the load path.
$:.unshift(File.join(File.dirname(msfbase), '..', 'lib'))
|
|
|
|
|
|
|
|
require 'rex'
|
|
|
|
require 'msf/ui'
|
|
|
|
require 'msf/base'
|
|
|
|
|
|
|
|
|
|
|
|
#
# Global crawler settings.  Some of these are overridden by the command line
# options parsed at the bottom of this file.
#

# Sleep time (secs) between requests
$sleeptime = 0

# Timeout (secs) used when taking the next request from the queue; the crawl
# loop ends when this expires
$taketimeout = 15

# Read timeout (-1 forever)
$readtimeout = -1

# Directory containing crawler modules
$crawlermodulesdir = File.join(File.dirname(msfbase),"..", "data", "msfcrawler")

# Database
$dbpathmsf = File.join(Msf::Config.get_config_root, 'sqlite3.db')

# Store in database?
$dbs = false

# Thread number
$threadnum = 20

# Dont crawl: comma separated list of file extensions that are never requested
$dontcrawl = ".exe,.zip,.tar,.bz2,.run,.asc,.gz,"

# Use proxy
$useproxy = false

# Proxy host
$proxyhost = '127.0.0.1'

# Proxy Port
$proxyport = 8080

# Cookie Jar
$cookiejar = {}

# Verbose
$verbose = false

# Enable URI Limits
$enableul = true

# Maximum number of requests per URI (check $enableul)
$maxurilimit = 1
|
|
|
|
|
2010-04-26 18:29:24 +00:00
|
|
|
|
|
|
|
|
|
|
|
#
# Queue driven HTTP crawler.
#
# Requests are plain hashes with the keys 'rhost', 'rport', 'uri', 'method',
# 'ctype', 'ssl', 'query' and 'data'.  Pending requests live in a
# Rinda::TupleSpace (@NotViewedQueue); requests that were already sent are
# remembered by signature in @ViewedQueue.  Responses with status 200 are
# handed to every loaded crawler module, which may queue further requests
# through #insertnewpath.
#
class HttpCrawler
  attr_accessor :ctarget, :cport, :cinipath, :cssl, :proxyhost, :proxyport, :useproxy

  #
  # Stores the target and proxy settings, queues the initial request for
  # +inipath+ ('/' when nil or empty) and loads the crawler modules found in
  # $crawlermodulesdir.
  #
  def initialize(target,port,inipath,ssl,proxyhost,proxyport,useproxy)
    self.ctarget = target
    self.cport = port
    self.cssl = ssl

    self.useproxy = useproxy
    self.proxyhost = proxyhost
    self.proxyport = proxyport

    self.cinipath = (inipath.nil? || inipath.empty?) ? '/' : inipath

    inireq = {
      'rhost'  => self.ctarget,
      'rport'  => self.cport,
      'uri'    => self.cinipath,
      'method' => 'GET',
      'ctype'  => 'text/plain',
      'ssl'    => self.cssl,
      'query'  => nil,
      'data'   => nil
    }

    @NotViewedQueue = Rinda::TupleSpace.new
    @ViewedQueue = {}
    @UriLimits = {}

    insertnewpath(inireq)

    puts "Loading modules: #{$crawlermodulesdir}"
    load_modules
    puts "OK"
  end

  #
  # Returns a request hash for the given host, port and ssl flag with every
  # other field set to nil.  #run uses it as the tuple space template when
  # taking the next pending request.
  #
  def reqtemplate(target,port,ssl)
    {
      'rhost'  => target,
      'rport'  => port,
      'uri'    => nil,
      'method' => nil,
      'ctype'  => nil,
      'ssl'    => ssl,
      'query'  => nil,
      'data'   => nil
    }
  end

  #
  # Inserts one request/response pair into the wmap_requests table of the
  # SQLite database at +dbpath+.
  #
  # hashreq:  request hash (see class comment)
  # response: response object; only #code and #body are read
  # dbpath:   path of the SQLite database file
  #
  # Table layout as noted by the original author:
  #
  #   CREATE TABLE "wmap_requests" (
  #     "id" INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,
  #     "host" varchar(255),
  #     "address" varchar(16),
  #     "address6" varchar(255),
  #     "port" integer,
  #     "ssl" integer,
  #     "meth" varchar(32),
  #     "path" text,
  #     "headers" text,
  #     "query" text,
  #     "body" text,
  #     "respcode" varchar(16),
  #     "resphead" text,
  #     "response" text,
  #     "created_at" datetime);
  #
  # NOTE(review): the insert also writes "updated_at", which is not part of
  # the layout above - confirm against the real schema.
  #
  def storedb(hashreq,response,dbpath)
    db = SQLite3::Database.new(dbpath)
    begin
      # Bound as a string: drivers that only accept primitive bind values
      # reject Time objects.
      now = Time.now.to_s

      # Block form: commits when the block returns, rolls back if it raises.
      db.transaction do
        db.execute(
          "insert into wmap_requests (host,address,address6,port,ssl,meth,path,headers,query,body,respcode,resphead,response,created_at,updated_at) values (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)",
          [
            hashreq['rhost'],
            hashreq['rhost'],
            hashreq['rhost'],
            hashreq['rport'].to_i,
            hashreq['ssl'] ? 1 : 0,
            hashreq['method'],
            SQLite3::Blob.new(hashreq['uri']),
            SQLite3::Blob.new(''),
            SQLite3::Blob.new(hashreq['query'] ? hashreq['query'] : ''),
            SQLite3::Blob.new(hashreq['data'] ? hashreq['data'] : ''),
            response.code.to_s,
            SQLite3::Blob.new(''),
            SQLite3::Blob.new(response.body.to_s),
            now,
            now
          ]
        )
      end
    ensure
      # Always release the handle, including when the insert fails.
      db.close
    end
  end

  #
  # Main crawl loop.  Takes pending requests for the configured target from
  # the queue and processes them one at a time until no request arrives
  # within $taketimeout seconds.
  #
  # TODO: requests are handled sequentially; $threadnum is not used here.
  #
  def run
    reqfilter = reqtemplate(self.ctarget, self.cport, self.cssl)

    loop do
      hashreq = @NotViewedQueue.take(reqfilter, $taketimeout)
      process_request(hashreq)
    end
  rescue Rinda::RequestExpiredError
    puts "END."
  end

  #
  # Modified version of load_protocols from psnuffle by Max Moser <mmo@remote-exploit.org>
  #
  # Evaluates every *.rb file in $crawlermodulesdir inside its own anonymous
  # module and instantiates each constant named Crawler<Name> it defines,
  # storing the instance in @crawlermodules under the downcased <Name>.
  # A file that fails to load is reported and skipped.
  #
  # Raises RuntimeError when $crawlermodulesdir is not a directory.
  #
  def load_modules
    base = $crawlermodulesdir
    unless File.directory?(base)
      raise RuntimeError,"The Crawler modules parameter is set to an invalid directory"
    end

    @crawlermodules = {}

    Dir.entries(base).grep(/\.rb$/).sort.each do |n|
      f = File.join(base, n)
      m = ::Module.new
      begin
        # The file name is passed so backtraces point at the module file.
        m.module_eval(File.read(f), f)
        m.constants.each do |const|
          match = /^Crawler(.*)/.match(const.to_s)
          next unless match

          cmod = match[1]
          klass = m.const_get("Crawler#{cmod}")
          @crawlermodules[cmod.downcase] = klass.new(self)

          puts("Loaded crawler module #{cmod} from #{f}...")
        end
      rescue ::StandardError, ::ScriptError => e
        # ScriptError covers SyntaxError/LoadError in a module file.
        # Interrupt and SystemExit are deliberately not swallowed.
        puts("Crawler module #{n} failed to load: #{e.class} #{e} #{e.backtrace}")
      end
    end
  end

  #
  # Sends the request described by +reqopts+ through +nclient+ and handles
  # the response: optional database storage ($dbs), console output, module
  # parsing for 200 responses and queueing of 301-303 redirect targets.
  # Any StandardError is reported as "ERROR" (details when $verbose) and
  # not re-raised.
  #
  def sendreq(nclient,reqopts={})
    r = nclient.request_raw(reqopts)
    resp = nclient.send_recv(r, $readtimeout)

    # Skip over "100 Continue" interim responses.
    while resp && resp.code == 100
      resp = nclient.reread_response(resp, $readtimeout)
    end

    if resp
      #
      # Quickfix for bug packet.rb to_s line: 190
      # In case modules or crawler calls to_s on de-chunked responses
      #
      resp.transfer_chunked = false

      # Set-Cookie headers are currently ignored ($cookiejar is not filled).

      storedb(reqopts, resp, $dbpathmsf) if $dbs

      puts ">> [#{resp.code}] #{reqopts['uri']}"
      puts ">>> [Q] #{reqopts['query']}" if reqopts['query'] && !reqopts['query'].empty?
      puts ">>> [D] #{reqopts['data']}" if reqopts['data']

      case resp.code
      when 200
        @crawlermodules.each_value do |mod|
          mod.parse(reqopts, resp)
        end
      when 301..303
        location = resp['Location']
        puts "[#{resp.code}] Redirection to: #{location}"
        # A redirect without a Location header has nothing to follow.
        unless location.nil? || location.empty?
          newreq = urltohash('GET', location, reqopts['uri'], nil)
          puts newreq if $verbose
          insertnewpath(newreq)
        end
      when 404
        puts "[404] Invalid link #{reqopts['uri']}"
      else
        puts "Unhandled #{resp.code}"
      end
    else
      puts "No response"
    end

    sleep($sleeptime)
  rescue ::StandardError => e
    puts "ERROR"
    puts "#{e}: #{e.backtrace}" if $verbose
  end

  #
  # Add new path (uri) to test non-viewed queue
  #
  # The request's 'uri' is canonicalized in place.  Requests for another
  # host/port than the crawl target, requests already visited and requests
  # already waiting in the queue are not added.
  #
  def insertnewpath(hashreq)
    hashreq['uri'] = canonicalize(hashreq['uri'])

    return unless hashreq['rhost'] == self.ctarget && hashreq['rport'] == self.cport

    sig = hashsig(hashreq)
    if @ViewedQueue.include?(sig)
      puts "#{hashreq['uri']} already visited at #{@ViewedQueue[sig]}" if $verbose
    elsif @NotViewedQueue.read_all(hashreq).size > 0
      puts "Already in queue to be viewed" if $verbose
    else
      puts "Inserted: #{hashreq['uri']}" if $verbose
      @NotViewedQueue.write(hashreq)
    end
  end

  #
  # Build a new hash for a local path
  #
  # m:        method
  # url:      uri?[query]
  # basepath: base path/uri to determine absolute path when relative
  # dat:      body data; for GET a non-nil value replaces the query instead
  #
  # Host, port and ssl default to the crawl target when +url+ does not carry
  # them.  A reference with an empty path and no host (e.g. "?a=1") refers to
  # the base document itself.  Raises whatever URI.parse raises for an
  # unparsable +url+.
  #
  def urltohash(m,url,basepath,dat)
    uri = URI.parse(url)
    no_host = uri.host.nil? || uri.host.empty?

    if no_host
      uritargethost = self.ctarget
      uritargetssl = self.cssl
    else
      uritargethost = uri.host
      uritargetssl = (uri.scheme == "https")
    end

    uritargetport = uri.port.nil? ? self.cport : uri.port

    refpath = uri.path.to_s
    if refpath.empty?
      # Empty path: same document when relative, site root otherwise.
      newp = no_host ? basepath.to_s : "/"
      newp = "/" if newp.empty?
    elsif Pathname.new(refpath).absolute?
      newp = refpath
    else
      oldp = Pathname.new(basepath)
      if oldp.to_s[-1,1] == '/'
        # Base is a directory: resolve inside it.
        newp = (oldp + refpath).to_s
      else
        # Base is a document: resolve next to it.
        newp = File.join(oldp.dirname.to_s, refpath)
      end
    end

    hashreq = {
      'rhost'  => uritargethost,
      'rport'  => uritargetport,
      'uri'    => newp.to_s,
      'method' => m,
      'ctype'  => 'text/plain',
      'ssl'    => uritargetssl,
      'query'  => uri.query,
      'data'   => nil
    }

    if m == 'GET' && !dat.nil?
      hashreq['query'] = dat
    else
      hashreq['data'] = dat
    end

    hashreq
  end

  #
  # Returns +uri+ (a String or URI) as a string with "." and ".." path
  # segments resolved.  ".." segments that would climb above the top of the
  # path are dropped.  A URI object passed in is normalized in place.
  #
  # Originally based on http://www.ruby-forum.com/topic/140101 by Rob
  # Biedenharn; the repeated gsub! of that version never terminated on paths
  # such as "/../../a", so segments are now resolved in a single pass.
  #
  def canonicalize(uri)
    u = uri.kind_of?(URI) ? uri : URI.parse(uri.to_s)
    u.normalize!

    # Opaque URIs (e.g. mailto:) have no path to clean up.
    return u.to_s if u.path.nil?

    u.path = resolve_dot_segments(u.path)
    u.to_s
  end

  #
  # Returns a string signature identifying a request hash.  Keys are sorted
  # so that two hashes with equal content give the same signature regardless
  # of insertion order.
  #
  def hashsig(hashreq)
    return hashreq.to_s unless hashreq.is_a?(Hash)

    hashreq.sort_by { |key, _| key.to_s }.inspect
  end

  private

  # Applies URI limits and the visited check to one queued request and, if it
  # passes, sends it to the target.
  def process_request(hashreq)
    uri = hashreq['uri']
    sig = hashsig(hashreq)

    if $enableul && @UriLimits.include?(uri)
      limit_reached = @UriLimits[uri] >= $maxurilimit
    else
      @UriLimits[uri] = 0
      limit_reached = false
    end

    if @ViewedQueue.include?(sig)
      puts "#{uri} already visited at #{@ViewedQueue[sig]}" if $verbose
      return
    end

    if limit_reached
      puts "URI LIMIT Reached: #{$maxurilimit} for uri #{uri}" if $verbose
      return
    end

    @ViewedQueue[sig] = Time.now
    @UriLimits[uri] += 1

    if skip_extension?(uri)
      puts "URI not crawled #{uri}" if $verbose
      return
    end

    prx = nil
    if self.useproxy
      prx = "HTTP:"+self.proxyhost.to_s+":"+self.proxyport.to_s
    end

    c = Rex::Proto::Http::Client.new(
      self.ctarget,
      self.cport.to_i,
      {},
      self.cssl,
      nil,
      prx
    )

    begin
      sendreq(c, hashreq)
    ensure
      # One client is created per request; release its connection.
      begin
        c.close
      rescue ::StandardError
        # Best effort: a failing close must not abort the crawl.
      end
    end
  end

  # True when the extension of +uri+ is one of the comma separated entries of
  # $dontcrawl (exact, case-insensitive match).
  def skip_extension?(uri)
    ext = File.extname(uri.to_s).downcase
    return false if ext.empty?

    $dontcrawl.to_s.split(',').map { |entry| entry.strip.downcase }.include?(ext)
  end

  # Resolves "." and ".." segments of +path+ in one pass.  Empty segments
  # ("//") are kept as they are.
  def resolve_dot_segments(path)
    segments = path.split('/', -1)
    # For an absolute path the first segment is '' and must never be popped.
    floor = path[0,1] == '/' ? 1 : 0
    out = []

    segments.each_with_index do |seg, idx|
      last = (idx == segments.length - 1)
      case seg
      when '.'
        out << '' if last
      when '..'
        out.pop if out.length > floor
        # A trailing dot segment names a directory: keep the final slash.
        out << '' if last
      else
        out << seg
      end
    end

    out.join('/')
  end
end
|
2010-04-26 18:29:24 +00:00
|
|
|
|
|
|
|
#
# Base class for crawler modules.  An instance keeps a reference to the
# crawler it was created for and forwards queueing, signature and URL helper
# calls to it.  Subclasses override #parse.
#
class BaseParser
  attr_accessor :crawler

  # c: the crawler object this parser reports to
  def initialize(c)
    self.crawler = c
  end

  # Called with the request hash and the response; does nothing and returns
  # nil in the base class.
  def parse(request,result)
    nil
  end

  #
  # Add new path (uri) to test hash queue
  #
  def insertnewpath(hashreq)
    self.crawler.insertnewpath(hashreq)
  end

  # Signature of a request hash, as computed by the crawler.
  def hashsig(hashreq)
    self.crawler.hashsig(hashreq)
  end

  # Builds a request hash through the crawler (same arguments).
  def urltohash(m,url,basepath,dat)
    self.crawler.urltohash(m,url,basepath,dat)
  end

  # SSL flag of the crawl target.
  def targetssl
    self.crawler.cssl
  end

  # Port of the crawl target.
  def targetport
    self.crawler.cport
  end

  # Host of the crawl target.
  def targethost
    self.crawler.ctarget
  end

  # Initial path of the crawl.
  def targetinipath
    self.crawler.cinipath
  end
end
|
|
|
|
|
|
|
|
|
2010-05-03 17:13:09 +00:00
|
|
|
#
# Command line entry point: parse the options, build the crawler for the
# target given with -t and run it.
#

# Exit on Ctrl-C.
trap("INT") {
  exit()
}

$args = Rex::Parser::Arguments.new(
  "-t" => [ true,  "Target URI" ],
  "-d" => [ false, "Enable database" ],
  # -u is a plain switch; declaring it with a value made it swallow the
  # option that followed it.
  "-u" => [ false, "Use proxy"],
  "-x" => [ true,  "Proxy host" ],
  "-p" => [ true,  "Proxy port" ],
  "-h" => [ false, "Display this help information"],
  "-v" => [ false, "Verbose" ]
)

if ARGV.length < 1
  puts("\n" + "    Usage: #{$0} <options>\n" + $args.usage)
  exit
end

turl = nil

$args.parse(ARGV) { |opt, idx, val|
  case opt
  when "-d"
    $dbs = true
  when "-t"
    $crun = true
    turl = val
  when "-u"
    $useproxy = true
  when "-v"
    $verbose = true
  when "-x"
    $proxyhost = val
  when "-p"
    # Was assigned to the misspelled $proxyposrt, so -p had no effect.
    $proxyport = val.to_i
  when "-h"
    puts("\n" + "    Usage: #{$0} <options>\n" + $args.usage)
    exit
  end
}

if $crun
  begin
    uri = URI.parse(turl)
  rescue URI::InvalidURIError
    puts "Error: target http(s)://target/path"
    exit
  end

  tssl = (uri.scheme == "https")

  if (uri.host.nil? || uri.host.empty?)
    puts "Error: target http(s)://target/path"
    exit
  end

  if $useproxy
    puts "Using proxy: #{$proxyhost}:#{$proxyport}"
  end

  mc = HttpCrawler.new(uri.host,uri.port,uri.path,tssl,$proxyhost, $proxyport, $useproxy)

  if $dbs
    puts "Database: #{$dbpathmsf}"
  else
    puts "[DATABASE DISABLED]"
  end

  if $enableul
    puts "URI LIMITS ENABLED: #{$maxurilimit}"
  end

  puts "Target: #{mc.ctarget} Port: #{mc.cport} Path: #{mc.cinipath} SSL: #{mc.cssl}"

  mc.run
else
  # Without -t there is nothing to crawl; say so instead of exiting silently.
  puts "Error: no target specified (-t)"
  puts("\n" + "    Usage: #{$0} <options>\n" + $args.usage)
end
|
|
|
|
|
|
|
|
|
2010-05-03 17:13:09 +00:00
|
|
|
|
|
|
|
|
|
|
|
|