62 lines
1.0 KiB
Ruby
62 lines
1.0 KiB
Ruby
|
require 'rubygems'
|
||
|
require 'uri'
|
||
|
|
||
|
# Link-extracting parser: scans a response body for href attributes and queues
# every usable link as a new GET request through insertnewpath (not defined in
# this class; presumably provided by BaseParser).
class CrawlerBasic < BaseParser
  # Matches href="..." or href='...'. The backreference forces the value to end
  # at the same quote character that opened it, so a URL that contains the
  # other quote character (e.g. an apostrophe inside double quotes) is not cut
  # short.
  HREF_PATTERN = /href\s*=\s*(["'])(.+?)\1/

  # Extracts each href value from the response body and passes one request
  # hash per usable link to insertnewpath.
  #
  # request - Hash for the request that produced +result+. Only 'rhost' and
  #           'rport' are read, as fallbacks for links without host/port.
  # result  - Response object; only result.body is used (via to_s).
  #
  # Returns the Array of one-element [href] arrays found in the body.
  def parse(request, result)
    # Drop the captured quote character so each entry keeps the [href] shape.
    links = result.body.to_s.scan(HREF_PATTERN).map { |_quote, href| [href] }

    links.each do |link|
      begin
        hreq = build_link_request(link[0], request)
        insertnewpath(hreq) if hreq
      rescue URI::InvalidURIError
        # Best effort: an href that is not a parseable URI is skipped so one
        # malformed link does not abort processing of the remaining links.
      end
    end
  end

  private

  # Builds the request hash for a single href value.
  #
  # href    - String taken from an href attribute.
  # request - Hash of the originating request ('rhost'/'rport' fallbacks).
  #
  # Returns the request Hash, or nil when the link uses a scheme other than
  # http/https (mailto:, javascript:, ftp:, ...), which cannot be fetched with
  # an HTTP GET.
  # Raises URI::InvalidURIError when +href+ cannot be parsed.
  def build_link_request(href, request)
    uri = URI.parse(href)
    scheme = uri.scheme.to_s.downcase
    return nil unless scheme.empty? || scheme == 'http' || scheme == 'https'

    # URI#path is "" (not nil) for links such as "http://host" or "?a=1",
    # so an empty path must be checked explicitly to fall back to "/".
    path = uri.path.to_s.empty? ? '/' : uri.path

    # NOTE(review): a relative link (no scheme) always gets 'ssl' => false,
    # even if the current page was fetched over HTTPS. Whether +request+
    # carries an ssl flag is not visible here -- confirm before changing.
    {
      'rhost'  => uri.host || request['rhost'],
      'rport'  => uri.port || request['rport'],
      'uri'    => path,
      'method' => 'GET',
      'ctype'  => 'text/plain',
      'ssl'    => scheme == 'https',
      'query'  => uri.query
    }
  end
end
|
||
|
|