From 450a11789434d5692a77ae5ecb29938c98ca8e89 Mon Sep 17 00:00:00 2001 From: et <> Date: Wed, 7 Apr 2010 03:33:21 +0000 Subject: [PATCH] Uri limits added git-svn-id: file:///home/svn/framework3/trunk@9031 4d416f70-5f16-0410-b530-b9f4589650da --- tools/msfcrawler.rb | 45 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 35 insertions(+), 10 deletions(-) diff --git a/tools/msfcrawler.rb b/tools/msfcrawler.rb index 5eeb363e4b..91f51e79a7 100755 --- a/tools/msfcrawler.rb +++ b/tools/msfcrawler.rb @@ -63,7 +63,15 @@ $proxyport = 8080 $cookiejar = {} # Verbose -$verbose = false +$verbose = false + +# Enable URI Limits +$enableul = true + +# Maximum number of requests per URI (check $enableul) +$maxurilimit = 1 + + class HttpCrawler attr_accessor :ctarget, :cport, :cinipath, :cssl, :proxyhost, :proxyport, :useproxy @@ -92,7 +100,8 @@ class HttpCrawler @NotViewedQueue = Rinda::TupleSpace.new - @ViewedQueue = Hash.new + @ViewedQueue = Hash.new + @UriLimits = Hash.new insertnewpath(inireq) @@ -182,18 +191,29 @@ class HttpCrawler # a.push(Thread.new { #### - hashreq = @NotViewedQueue.take(reqfilter, $taketimeout) + hashreq = @NotViewedQueue.take(reqfilter, $taketimeout) + + ul = false + if @UriLimits.include?(hashreq['uri']) and $enableul + #puts "Request #{@UriLimits[hashreq['uri']]}/#{$maxurilimit} #{hashreq['uri']}" + if @UriLimits[hashreq['uri']] >= $maxurilimit + #puts "URI LIMIT Reached: #{$maxurilimit} for uri #{hashreq['uri']}" + ul = true + end + else + @UriLimits[hashreq['uri']] = 0 + end - if !@ViewedQueue.include?(hashsig(hashreq)) + if !@ViewedQueue.include?(hashsig(hashreq)) and !ul - @ViewedQueue[hashsig(hashreq)] = Time.now - + @ViewedQueue[hashsig(hashreq)] = Time.now + @UriLimits[hashreq['uri']] += 1 + if !File.extname(hashreq['uri']).empty? and $dontcrawl.include? File.extname(hashreq['uri']) if $verbose puts "URI not crawled #{hashreq['uri']}" end - else - + else prx = nil if self.useproxy @@ -307,7 +327,7 @@ class HttpCrawler @crawlermodules.each_key do |k| @crawlermodules[k].parse(reqopts,resp) end - when 301..302 + when 301..303 puts "[#{resp.code}] Redirection to: #{resp['Location']}" if $verbose puts urltohash(resp['Location']) @@ -493,7 +513,12 @@ if $crun puts "Database: #{$dbpathmsf}" else puts "[DATABASE DISABLED]" - end + end + + if $enableul + puts "URI LIMITS ENABLED: #{$maxurilimit}" + end + puts "Target: #{mc.ctarget} Port: #{mc.cport} Path: #{mc.cinipath} SSL: #{mc.cssl}" mc.run end