2010-05-03 17:13:09 +00:00
|
|
|
##
|
|
|
|
# $Id$
|
|
|
|
##
|
|
|
|
|
|
|
|
##
|
|
|
|
# This file is part of the Metasploit Framework and may be subject to
|
|
|
|
# redistribution and commercial restrictions. Please see the Metasploit
|
|
|
|
# Framework web site for more information on licensing and terms of use.
|
|
|
|
# http://metasploit.com/framework/
|
|
|
|
##
|
|
|
|
|
|
|
|
# $Revision$
|
|
|
|
|
2010-01-26 04:21:07 +00:00
|
|
|
require 'rubygems'
|
2010-02-06 05:16:29 +00:00
|
|
|
require 'pathname'
|
2010-03-21 00:13:12 +00:00
|
|
|
require 'hpricot'
|
2010-01-26 04:21:07 +00:00
|
|
|
require 'uri'
|
|
|
|
|
2010-03-21 00:13:12 +00:00
|
|
|
# Crawler parser module that extracts anchor (<a href="...">) links from HTML
# responses and hands each one to the crawler as a new GET request.
#
# Relies on +urltohash+ and +insertnewpath+, which are not defined in this
# file (presumably provided by BaseParser -- confirm against the base class).
class CrawlerSimple < BaseParser
  # Scans an HTML response for anchor links and queues every usable target.
  #
  # request - the request that produced +result+; only request['uri'] is read
  #           here and it is passed to +urltohash+ alongside each href.
  # result  - the response object; must answer to ['Content-Type'] and #body.
  #
  # Returns early (nil) when the response is not HTML. Hrefs that raise
  # URI::InvalidURIError while being processed are skipped silently.
  def parse(request, result)
    content_type = result['Content-Type']

    # A response may carry no Content-Type header at all; the lookup then
    # yields nil and calling include? on it would raise NoMethodError.
    # Treat a missing header the same as "not HTML".
    return unless content_type && content_type.include?("text/html")

    doc = Hpricot(result.body.to_s)

    doc.search('a').each do |link|
      hr = link.attributes['href']

      # Skip anchors without an href, in-page fragments and javascript: links.
      next unless hr
      next if hr.match(/^(\#|javascript\:)/)

      begin
        hreq = urltohash('GET', hr, request['uri'], nil)
        insertnewpath(hreq)
      rescue URI::InvalidURIError
        # Best-effort crawl: a malformed href is ignored and the
        # remaining links are still processed.
      end
    end
  end
end
|
|
|
|
|