URI limits added
git-svn-id: file:///home/svn/framework3/trunk@9031 4d416f70-5f16-0410-b530-b9f4589650da
unstable
parent
62ba505f10
commit
450a117894
|
@ -63,7 +63,15 @@ $proxyport = 8080
|
|||
$cookiejar = {}
|
||||
|
||||
# Verbose
|
||||
$verbose = false
|
||||
$verbose = false
|
||||
|
||||
# Enable URI Limits
|
||||
$enableul = true
|
||||
|
||||
# Maximum number of requests per URI (only enforced when $enableul is true)
|
||||
$maxurilimit = 1
|
||||
|
||||
|
||||
|
||||
class HttpCrawler
|
||||
attr_accessor :ctarget, :cport, :cinipath, :cssl, :proxyhost, :proxyport, :useproxy
|
||||
|
@ -92,7 +100,8 @@ class HttpCrawler
|
|||
|
||||
|
||||
@NotViewedQueue = Rinda::TupleSpace.new
|
||||
@ViewedQueue = Hash.new
|
||||
@ViewedQueue = Hash.new
|
||||
@UriLimits = Hash.new
|
||||
|
||||
insertnewpath(inireq)
|
||||
|
||||
|
@ -182,18 +191,29 @@ class HttpCrawler
|
|||
# a.push(Thread.new {
|
||||
####
|
||||
|
||||
hashreq = @NotViewedQueue.take(reqfilter, $taketimeout)
|
||||
hashreq = @NotViewedQueue.take(reqfilter, $taketimeout)
|
||||
|
||||
ul = false
|
||||
if @UriLimits.include?(hashreq['uri']) and $enableul
|
||||
#puts "Request #{@UriLimits[hashreq['uri']]}/#{$maxurilimit} #{hashreq['uri']}"
|
||||
if @UriLimits[hashreq['uri']] >= $maxurilimit
|
||||
#puts "URI LIMIT Reached: #{$maxurilimit} for uri #{hashreq['uri']}"
|
||||
ul = true
|
||||
end
|
||||
else
|
||||
@UriLimits[hashreq['uri']] = 0
|
||||
end
|
||||
|
||||
if !@ViewedQueue.include?(hashsig(hashreq))
|
||||
if !@ViewedQueue.include?(hashsig(hashreq)) and !ul
|
||||
|
||||
@ViewedQueue[hashsig(hashreq)] = Time.now
|
||||
|
||||
@ViewedQueue[hashsig(hashreq)] = Time.now
|
||||
@UriLimits[hashreq['uri']] += 1
|
||||
|
||||
if !File.extname(hashreq['uri']).empty? and $dontcrawl.include? File.extname(hashreq['uri'])
|
||||
if $verbose
|
||||
puts "URI not crawled #{hashreq['uri']}"
|
||||
end
|
||||
else
|
||||
|
||||
else
|
||||
|
||||
prx = nil
|
||||
if self.useproxy
|
||||
|
@ -307,7 +327,7 @@ class HttpCrawler
|
|||
@crawlermodules.each_key do |k|
|
||||
@crawlermodules[k].parse(reqopts,resp)
|
||||
end
|
||||
when 301..302
|
||||
when 301..303
|
||||
puts "[#{resp.code}] Redirection to: #{resp['Location']}"
|
||||
if $verbose
|
||||
puts urltohash(resp['Location'])
|
||||
|
@ -493,7 +513,12 @@ if $crun
|
|||
puts "Database: #{$dbpathmsf}"
|
||||
else
|
||||
puts "[DATABASE DISABLED]"
|
||||
end
|
||||
end
|
||||
|
||||
if $enableul
|
||||
puts "URI LIMITS ENABLED: #{$maxurilimit}"
|
||||
end
|
||||
|
||||
puts "Target: #{mc.ctarget} Port: #{mc.cport} Path: #{mc.cinipath} SSL: #{mc.cssl}"
|
||||
mc.run
|
||||
end
|
||||
|
|
Loading…
Reference in New Issue