Using hpricot

git-svn-id: file:///home/svn/framework3/trunk@8862 4d416f70-5f16-0410-b530-b9f4589650da
unstable
et 2010-03-21 00:13:12 +00:00
parent eb61f72431
commit 5949b91612
1 changed files with 25 additions and 13 deletions

View File

@ -1,18 +1,26 @@
require 'rubygems' require 'rubygems'
require 'pathname' require 'pathname'
require 'hpricot'
require 'uri' require 'uri'
class CrawlerBasic < BaseParser class CrawlerSimple < BaseParser
def parse(request,result) def parse(request,result)
#puts "R: #{result.body}" if !result['Content-Type'].include? "text/html"
return
end
links = result.body.to_s.scan(/href\s*=\s*[\"\'](.+?)[\"\']/) doc = Hpricot(result.body.to_s)
doc.search('a').each do |link|
links.each do |link| hr = link.attributes['href']
if hr
#links = result.body.to_s.scan(/href\s*=\s*[\"\'](.+?)[\"\']/)
#links.each do |link|
begin begin
uri = URI.parse(link[0]) uri = URI.parse(hr)
tssl = false tssl = false
if uri.scheme == "https" if uri.scheme == "https"
@ -23,6 +31,7 @@ class CrawlerBasic < BaseParser
if !uri.host or uri.host == nil if !uri.host or uri.host == nil
thost = request['rhost'] thost = request['rhost']
tssl = self.targetssl
else else
thost = uri.host thost = uri.host
end end
@ -40,12 +49,14 @@ class CrawlerBasic < BaseParser
end end
newp = Pathname.new(tpath) newp = Pathname.new(tpath)
if !newp.absolute?
oldp = Pathname.new(request['uri']) oldp = Pathname.new(request['uri'])
if !oldp.absolute?
if !newp.absolute?
newp = oldp + newp.cleanpath newp = oldp + newp.cleanpath
end end
end
hreq = { hreq = {
'rhost' => thost, 'rhost' => thost,
@ -57,7 +68,7 @@ class CrawlerBasic < BaseParser
'query' => uri.query 'query' => uri.query
} }
#puts "R: #{hreq['uri']}"
insertnewpath(hreq) insertnewpath(hreq)
rescue URI::InvalidURIError rescue URI::InvalidURIError
@ -67,4 +78,5 @@ class CrawlerBasic < BaseParser
end end
end end
end end
end