URI limits added

git-svn-id: file:///home/svn/framework3/trunk@9031 4d416f70-5f16-0410-b530-b9f4589650da
unstable
et 2010-04-07 03:33:21 +00:00
parent 62ba505f10
commit 450a117894
1 changed file with 35 additions and 10 deletions

View File

@ -63,7 +63,15 @@ $proxyport = 8080
$cookiejar = {}
# Verbose
$verbose = false
$verbose = false
# Enable URI Limits
$enableul = true
# Maximum number of requests per URI (check $enableul)
$maxurilimit = 1
class HttpCrawler
attr_accessor :ctarget, :cport, :cinipath, :cssl, :proxyhost, :proxyport, :useproxy
@ -92,7 +100,8 @@ class HttpCrawler
@NotViewedQueue = Rinda::TupleSpace.new
@ViewedQueue = Hash.new
@ViewedQueue = Hash.new
@UriLimits = Hash.new
insertnewpath(inireq)
@ -182,18 +191,29 @@ class HttpCrawler
# a.push(Thread.new {
####
hashreq = @NotViewedQueue.take(reqfilter, $taketimeout)
hashreq = @NotViewedQueue.take(reqfilter, $taketimeout)
ul = false
if @UriLimits.include?(hashreq['uri']) and $enableul
#puts "Request #{@UriLimits[hashreq['uri']]}/#{$maxurilimit} #{hashreq['uri']}"
if @UriLimits[hashreq['uri']] >= $maxurilimit
#puts "URI LIMIT Reached: #{$maxurilimit} for uri #{hashreq['uri']}"
ul = true
end
else
@UriLimits[hashreq['uri']] = 0
end
if !@ViewedQueue.include?(hashsig(hashreq))
if !@ViewedQueue.include?(hashsig(hashreq)) and !ul
@ViewedQueue[hashsig(hashreq)] = Time.now
@ViewedQueue[hashsig(hashreq)] = Time.now
@UriLimits[hashreq['uri']] += 1
if !File.extname(hashreq['uri']).empty? and $dontcrawl.include? File.extname(hashreq['uri'])
if $verbose
puts "URI not crawled #{hashreq['uri']}"
end
else
else
prx = nil
if self.useproxy
@ -307,7 +327,7 @@ class HttpCrawler
@crawlermodules.each_key do |k|
@crawlermodules[k].parse(reqopts,resp)
end
when 301..302
when 301..303
puts "[#{resp.code}] Redirection to: #{resp['Location']}"
if $verbose
puts urltohash(resp['Location'])
@ -493,7 +513,12 @@ if $crun
puts "Database: #{$dbpathmsf}"
else
puts "[DATABASE DISABLED]"
end
end
if $enableul
puts "URI LIMITS ENABLED: #{$maxurilimit}"
end
puts "Target: #{mc.ctarget} Port: #{mc.cport} Path: #{mc.cinipath} SSL: #{mc.cssl}"
mc.run
end