metasploit-framework/modules/auxiliary/crawler/msfcrawler.rb

##
# This module requires Metasploit: https://metasploit.com/download
# Current source: https://github.com/rapid7/metasploit-framework
##

#
# Web Crawler.
#
# Author:  Efrain Torres   et [at] metasploit.com 2010
#
#

# Require openssl before rubygems (load-order workaround for Mac OS)
require 'openssl'
require 'rinda/tuplespace'
require 'pathname'
require 'uri'

class MetasploitModule < Msf::Auxiliary
  include Msf::Auxiliary::Scanner
  include Msf::Auxiliary::Report

  #
  # Registers the module metadata together with the basic and advanced
  # datastore options used by the crawler.
  #
  # @param info [Hash] module information forwarded to update_info
  #
  def initialize(info = {})
    module_info = {
      'Name' => 'Metasploit Web Crawler',
      'Description' => 'This auxiliary module is a modular web crawler, to be used in conjunction with wmap (someday) or standalone.',
      'Author' => 'et',
      'License' => MSF_LICENSE
    }
    super(update_info(info, module_info))

    # Basic options: where to start crawling and on which port.
    basic_options = [
      OptString.new('PATH', [true, 'Starting crawling path', '/']),
      OptInt.new('RPORT', [true, 'Remote port', 80])
    ]
    register_options(basic_options)

    # Advanced options: parser module location, limits and timeouts.
    advanced_options = [
      OptPath.new(
        'CrawlerModulesDir',
        [true, 'The base directory containing the crawler modules', File.join(Msf::Config.data_directory, 'msfcrawler')]
      ),
      OptBool.new('EnableUl', [false, 'Enable maximum number of request per URI', true]),
      OptBool.new('StoreDB', [false, 'Store requests in database', false]),
      OptInt.new('MaxUriLimit', [true, 'Number max. request per URI', 10]),
      OptInt.new('SleepTime', [true, 'Sleep time (secs) between requests', 0]),
      OptInt.new('TakeTimeout', [true, 'Timeout for loop ending', 15]),
      OptInt.new('ReadTimeout', [true, 'Read timeout (-1 forever)', 3]),
      OptInt.new('ThreadNum', [true, 'Threads number', 20]),
      OptString.new('DontCrawl', [true, 'Filestypes not to crawl', '.exe,.zip,.tar,.bz2,.run,.asc,.gz'])
    ]
    register_advanced_options(advanced_options)
  end

  # Crawl target (host, port, SSL flag) and the starting path. They are
  # exposed as accessors so crawler modules can read them through the crawler
  # instance (BaseParser#targetinipath reads +cinipath+).
  attr_accessor :ctarget, :cport, :cssl, :cinipath

  #
  # Crawls the configured target.
  #
  # Seeds the not-viewed queue with the starting path, loads the crawler
  # modules and then keeps taking pending requests from the queue until no
  # request shows up within TakeTimeout seconds. Each request is sent at most
  # once (tracked by its hashsig) and, when EnableUl is set, each URI is
  # requested at most MaxUriLimit times.
  #
  # NOTE: requests are processed sequentially; the ThreadNum option is
  # registered but not used by this loop.
  #
  def run
    self.ctarget = datastore['RHOSTS']
    self.cport = datastore['RPORT']
    self.cssl = datastore['SSL']

    # Fall back to the site root when no starting path was given.
    start_path = datastore['PATH']
    self.cinipath = (start_path.nil? || start_path.empty?) ? '/' : start_path

    inireq = {
      'rhost' => ctarget,
      'rport' => cport,
      'uri' => cinipath,
      'method' => 'GET',
      'ctype' => 'text/plain',
      'ssl' => cssl,
      'query' => nil,
      'data' => nil
    }

    @NotViewedQueue = Rinda::TupleSpace.new
    @ViewedQueue = {}
    @UriLimits = {}
    # storedb reads @current_site when reporting pages.
    @current_site = ctarget

    insertnewpath(inireq)

    print_status("Loading modules: #{datastore['CrawlerModulesDir']}")
    load_modules(datastore['CrawlerModulesDir'])
    print_status("OK")

    if datastore['EnableUl']
      print_status("URI LIMITS ENABLED: #{datastore['MaxUriLimit']} (Maximum number of requests per uri)")
    end

    print_status("Target: #{ctarget} Port: #{cport} Path: #{cinipath} SSL: #{cssl}")

    # Compare extensions exactly: a substring test against the raw
    # comma-separated option would also skip e.g. ".r" (part of ".run").
    skipped_extensions = datastore['DontCrawl'].to_s.split(',').map(&:strip).reject(&:empty?)

    begin
      reqfilter = reqtemplate(ctarget, cport, cssl)

      loop do
        # Raises Rinda::RequestExpiredError once the queue stays empty for
        # TakeTimeout seconds; that is what normally ends the crawl.
        hashreq = @NotViewedQueue.take(reqfilter, datastore['TakeTimeout'])
        uri = hashreq['uri']

        limit_reached = false
        if datastore['EnableUl'] && @UriLimits.include?(uri)
          limit_reached = @UriLimits[uri] >= datastore['MaxUriLimit']
        else
          @UriLimits[uri] = 0
        end

        signature = hashsig(hashreq)
        if limit_reached || @ViewedQueue.include?(signature)
          vprint_line "#{uri} already visited. "
          next
        end

        @ViewedQueue[signature] = Time.now
        @UriLimits[uri] += 1

        extension = File.extname(uri)
        if !extension.empty? && skipped_extensions.include?(extension)
          vprint_status "URI not crawled #{uri}"
          next
        end

        # The last two arguments (nil, nil) were already nil in the previous
        # code; proxy support is not wired in.
        client = Rex::Proto::Http::Client.new(ctarget, cport.to_i, {}, cssl, nil, nil)
        sendreq(client, hashreq)
      end
    rescue Rinda::RequestExpiredError
      print_status("END.")
      return
    end

    print_status("Finished crawling")
  end

  #
  # Builds the request template that #run hands to the not-viewed queue when
  # taking pending requests. Only host, port and ssl are filled in; every
  # other field is nil (Rinda templates treat nil values as wildcards).
  #
  # @param target [String] target host
  # @param port [Integer] target port
  # @param ssl [Boolean, nil] SSL flag of the target
  # @return [Hash] request template keyed like a regular request hash
  #
  def reqtemplate(target,port,ssl)
    fields = %w[rhost rport uri method ctype ssl query data]
    blank = fields.map { |field| [field, nil] }.to_h

    # merge keeps the key order of +blank+ and only overwrites the values.
    blank.merge('rhost' => target, 'rport' => port, 'ssl' => ssl)
  end

  #
  # Reports a crawled page through report_web_page.
  #
  # @param hashreq [Hash] request hash ('uri', 'query', 'rhost', 'rport',
  #   'ssl' and 'data' are read)
  # @param response [#code, #body, #headers] response received for the request
  # @param dbpath [Object] unused; kept so existing callers keep working
  # @return result of report_web_page
  #
  def storedb(hashreq,response,dbpath)
    # host/port/ssl are included for report_web_page support.
    info = {
      :web_site => @current_site,
      :path     => hashreq['uri'],
      :query    => hashreq['query'],
      :host     => hashreq['rhost'],
      :port     => hashreq['rport'],
      # Use the truthiness of the flag: an explicit false (plain http URL)
      # must not be reported as SSL, which a nil check would do.
      :ssl      => hashreq['ssl'] ? true : false,
      :data     => hashreq['data'],
      :code     => response.code,
      :body     => response.body,
      :headers  => response.headers
    }

    # Report the web page to the database
    report_web_page(info)
  end

  #
  # Modified version of load_protocols from psnuffle by Max Moser  <mmo@remote-exploit.org>
  #
  # Loads every *.rb file found directly inside +crawlermodulesdir+ into its
  # own anonymous module, instantiates each constant named Crawler<Name>
  # with this crawler as argument and stores the instance in
  # @crawlermodules under the downcased <Name>.
  #
  # A file that fails to load is reported with print_error and skipped.
  #
  # @param crawlermodulesdir [String] directory containing the crawler modules
  # @raise [RuntimeError] if +crawlermodulesdir+ is not a directory
  #
  def load_modules(crawlermodulesdir)
    base = crawlermodulesdir
    unless File.directory?(base)
      raise RuntimeError,"The Crawler modules parameter is set to an invalid directory"
    end

    @crawlermodules = {}
    Dir.new(base).entries.grep(/\.rb$/).sort.each do |n|
      f = File.join(base, n)
      m = ::Module.new
      begin
        # Passing the file name gives errors raised by the module a useful
        # backtrace instead of an anonymous "(eval)" location.
        m.module_eval(File.read(f), f)

        m.constants.each do |const|
          found = /^Crawler(.*)/.match(const.to_s)
          next unless found

          cmod = found[1]
          klass = m.const_get("Crawler#{cmod}")
          @crawlermodules[cmod.downcase] = klass.new(self)

          print_status("Loaded crawler module #{cmod} from #{f}...")
        end
      rescue ::StandardError, ::ScriptError => e
        # ScriptError covers syntax/load errors in the module file. Interrupt
        # and SystemExit are deliberately not rescued so the user can still
        # abort while modules are loading.
        print_error("Crawler module #{n} failed to load: #{e.class} #{e} #{e.backtrace}")
      end
    end
  end

  #
  # Sends one request and dispatches on the response code:
  #
  # * 200: every loaded crawler module parses the response.
  # * 301-303, 307, 308: the Location target is queued as a new GET request.
  # * 404 and anything else: only reported.
  #
  # Any error raised while sending or processing is reported ("ERROR", with
  # details in verbose mode) and swallowed so the crawl can continue.
  #
  # @param nclient [#request_raw, #send_recv] HTTP client used for the request
  # @param reqopts [Hash] request hash ('uri', 'query' and 'data' are read here)
  #
  def sendreq(nclient,reqopts={})
    req = nclient.request_raw(reqopts)
    resp = nclient.send_recv(req, datastore['ReadTimeout'])

    if resp
      #
      # Quickfix for bug packet.rb to_s line: 190
      # In case modules or crawler calls to_s on de-chunked responses
      #
      resp.transfer_chunked = false

      # storedb ignores its third argument, so nil is passed explicitly.
      storedb(reqopts, resp, nil) if datastore['StoreDB']

      print_status ">> [#{resp.code}] #{reqopts['uri']}"
      print_status ">>> [Q] #{reqopts['query']}" if reqopts['query'] && !reqopts['query'].empty?
      print_status ">>> [D] #{reqopts['data']}" if reqopts['data']

      case resp.code
      when 200
        @crawlermodules.each_value { |parser| parser.parse(reqopts, resp) }
      when 301..303, 307, 308
        location = resp['Location']
        print_line("[#{resp.code}] Redirection to: #{location}")
        if location.nil? || location.empty?
          # Without this guard the missing header surfaced as a generic
          # "ERROR" from URI parsing.
          print_status "Redirect without Location header for #{reqopts['uri']}"
        else
          redirect = urltohash('GET', location, reqopts['uri'], nil)
          vprint_status redirect
          insertnewpath(redirect)
        end
      when 404
        print_status "[404] Invalid link #{reqopts['uri']}"
      else
        print_status "Unhandled #{resp.code}"
      end
    else
      print_status "No response"
    end

    sleep(datastore['SleepTime'])
  rescue => e
    print_status "ERROR"
    vprint_status "#{e}: #{e.backtrace}"
  end

  #
  # Queues a request for crawling unless it points to another host/port, was
  # already visited, or is already waiting in the not-viewed queue.
  #
  # The 'uri' entry of +hashreq+ is canonicalized in place before any check.
  #
  # @param hashreq [Hash] request hash as built by #urltohash
  #
  def insertnewpath(hashreq)
    hashreq['uri'] = canonicalize(hashreq['uri'])

    # Only requests for the configured target are crawled.
    same_host = hashreq['rhost'] == datastore['RHOSTS']
    return unless same_host && hashreq['rport'] == datastore['RPORT']

    if @ViewedQueue.include?(hashsig(hashreq))
      return vprint_status("#{hashreq['uri']} already visited at #{@ViewedQueue[hashsig(hashreq)]}")
    end

    unless @NotViewedQueue.read_all(hashreq).empty?
      return vprint_status("Already in queue to be viewed: #{hashreq['uri']}")
    end

    vprint_status "Inserted: #{hashreq['uri']}"

    @NotViewedQueue.write(hashreq)
  end

  #
  # Builds a request hash for a URL found while crawling.
  #
  # Host, SSL flag and port come from the URL when it carries them and
  # default to the crawl target otherwise. A relative path is resolved
  # against +basepath+; a reference without any path and without scheme/host
  # (such as "?page=2" or "#top") keeps +basepath+ itself instead of being
  # redirected to "/".
  #
  # @param m [String] HTTP method
  # @param url [String] uri?[query], absolute or relative
  # @param basepath [String] path of the page the URL was found on
  # @param dat [String, nil] for GET it replaces the query when not nil;
  #   for any other method it becomes the body data
  # @return [Hash] request hash ('rhost', 'rport', 'uri', 'method', 'ctype',
  #   'ssl', 'query', 'data')
  # @raise [URI::InvalidURIError] if +url+ cannot be parsed
  #
  def urltohash(m,url,basepath,dat)
    uri = URI.parse(url)
    hostless = uri.host.nil? || uri.host.empty?

    if hostless
      uritargethost = ctarget
      uritargetssl = cssl
    else
      uritargethost = uri.host
      uritargetssl = uri.scheme == 'https'
    end

    # uri.port is nil only when the URL has neither a scheme nor a port.
    uritargetport = uri.port || cport

    refpath = uri.path.to_s
    if refpath.empty?
      # No path component: a same-document reference stays on the base
      # path; an absolute URL without path means the site root.
      same_document = uri.scheme.nil? && hostless && !basepath.to_s.empty?
      resolved = same_document ? basepath.to_s : '/'
    elsif Pathname.new(refpath).absolute?
      resolved = refpath
    else
      oldp = Pathname.new(basepath)
      resolved =
        if oldp.to_s[-1, 1] == '/'
          # Base is a directory: append the relative path to it.
          (oldp + Pathname.new(refpath)).to_s
        else
          # Base is a file: resolve relative to its directory.
          File.join(oldp.dirname.to_s, refpath)
        end
    end

    hashreq = {
      'rhost' => uritargethost,
      'rport' => uritargetport,
      'uri' => resolved,
      'method' => m,
      'ctype' => 'text/plain',
      'ssl' => uritargetssl,
      'query' => uri.query,
      'data' => nil
    }

    if m == 'GET' && !dat.nil?
      hashreq['query'] = dat
    else
      hashreq['data'] = dat
    end

    hashreq
  end

  # Taken from http://www.ruby-forum.com/topic/140101 by  Rob Biedenharn
  #
  # Normalizes a URI and removes "." and ".." segments from its path.
  #
  # @param uri [URI, #to_s] URI object or anything whose to_s parses as one
  # @return [String] the canonical URI
  # @raise [URI::InvalidURIError] if the value cannot be parsed
  #
  def canonicalize(uri)
    u = uri.kind_of?(URI) ? uri : URI.parse(uri.to_s)
    u.normalize!

    path = u.path.dup

    # Collapse "segment/.." pairs until the path stops changing. A "../.."
    # pair is kept as is, so the loop must compare the result: gsub! reports
    # a substitution even when the replacement equals the match, which made
    # the former `while gsub!` loop spin forever on paths like "/../../a".
    loop do
      collapsed = path.gsub(%r{([^/]+)/\.\./?}) do |match|
        Regexp.last_match(1) == '..' ? match : ''
      end
      break if collapsed == path

      path = collapsed
    end

    path = path.gsub(%r{/\./}, '/').sub(%r{/\.\z}, '/')

    # Ugly fix: drop whatever "/../" is left (attempts to climb above the
    # root). Repeated because one pass only removes every other occurrence.
    path = path.gsub('/../', '/') while path.include?('/../')

    u.path = path
    u.to_s
  end

  # Returns the string form of a request hash; it is the key under which
  # visited requests are recorded in @ViewedQueue.
  def hashsig(hashreq)
    "#{hashreq}"
  end
end

#
# Base class for crawler modules. It keeps a reference to the crawler that
# instantiated it and forwards queue and target helpers to that crawler.
# Subclasses override #parse.
#
class BaseParser
  attr_accessor :crawler

  # @param c [Object] the crawler instance that owns this parser
  def initialize(c)
    self.crawler = c
  end

  #
  # Hook called with a request hash and its response; the base
  # implementation does nothing and returns nil.
  #
  def parse(request,result)
    nil
  end

  #
  # Add new path (uri) to test hash queue
  #
  def insertnewpath(hashreq)
    crawler.insertnewpath(hashreq)
  end

  # Signature of a request hash, as computed by the crawler.
  def hashsig(hashreq)
    crawler.hashsig(hashreq)
  end

  # Builds a request hash for +url+ relative to +basepath+ via the crawler.
  def urltohash(m,url,basepath,dat)
    crawler.urltohash(m,url,basepath,dat)
  end

  # SSL flag of the crawl target.
  def targetssl
    crawler.cssl
  end

  # Port of the crawl target.
  def targetport
    crawler.cport
  end

  # Host of the crawl target.
  def targethost
    crawler.ctarget
  end

  #
  # Starting path of the crawl.
  #
  # Uses the crawler's +cinipath+ when the crawler provides one; otherwise
  # falls back to its PATH datastore option, and to "/" when that is unset
  # or empty. The fallback avoids a NoMethodError with crawlers that do not
  # expose +cinipath+.
  #
  def targetinipath
    path = crawler.cinipath if crawler.respond_to?(:cinipath)
    path ||= crawler.datastore['PATH'] if crawler.respond_to?(:datastore)

    path.nil? || path.empty? ? '/' : path
  end
end
Fix file header comment [See #1555] 2013-03-07 23:53:19 +00:00			`##`
use https for metaploit.com links 2017-07-24 13:26:21 +00:00			`# This module requires Metasploit: https://metasploit.com/download`
Redo the boilerplate / splat [SeeRM #8496] 2013-10-15 18:50:46 +00:00			`# Current source: https://github.com/rapid7/metasploit-framework`
Fix file header comment [See #1555] 2013-03-07 23:53:19 +00:00			`##`

first try on modular crawling git-svn-id: file:///home/svn/framework3/trunk@10915 4d416f70-5f16-0410-b530-b9f4589650da 2010-11-05 04:00:49 +00:00			`#`
			`# Web Crawler.`
			`#`
			`# Author: Efrain Torres et [at] metasploit.com 2010`
			`#`
			`#`

			`# openssl before rubygems mac os`
			`require 'openssl'`
			`require 'rinda/tuplespace'`
			`require 'pathname'`
			`require 'uri'`

use MetasploitModule as a class name 2016-03-08 13:02:44 +00:00			`class MetasploitModule < Msf::Auxiliary`
Retab modules 2013-08-30 21:28:54 +00:00			`include Msf::Auxiliary::Scanner`
			`include Msf::Auxiliary::Report`

			`def initialize(info = {})`
			`super(update_info(info,`
			`'Name' => 'Metasploit Web Crawler',`
first round of spelling/grammar fixes 2017-08-25 01:38:44 +00:00			`'Description' => 'This auxiliary module is a modular web crawler, to be used in conjunction with wmap (someday) or standalone.',`
Retab modules 2013-08-30 21:28:54 +00:00			`'Author' => 'et',`
			`'License' => MSF_LICENSE`
			`))`

			`register_options([`
			`OptString.new('PATH', [true, "Starting crawling path", '/']),`
Extra commas. 2015-01-22 19:45:08 +00:00			`OptInt.new('RPORT', [true, "Remote port", 80 ])`
Fix msf/core and self.class msftidy warnings Also fixed rex requires. 2017-05-03 20:42:21 +00:00			`])`
Retab modules 2013-08-30 21:28:54 +00:00
			`register_advanced_options([`
			`OptPath.new('CrawlerModulesDir', [true, 'The base directory containing the crawler modules',`
Find and replace 2013-09-26 19:34:48 +00:00			`File.join(Msf::Config.data_directory, "msfcrawler")`
Retab modules 2013-08-30 21:28:54 +00:00			`]),`
			`OptBool.new('EnableUl', [ false, "Enable maximum number of request per URI", true ]),`
			`OptBool.new('StoreDB', [ false, "Store requests in database", false ]),`
			`OptInt.new('MaxUriLimit', [ true, "Number max. request per URI", 10]),`
			`OptInt.new('SleepTime', [ true, "Sleep time (secs) between requests", 0]),`
			`OptInt.new('TakeTimeout', [ true, "Timeout for loop ending", 15]),`
			`OptInt.new('ReadTimeout', [ true, "Read timeout (-1 forever)", 3]),`
			`OptInt.new('ThreadNum', [ true, "Threads number", 20]),`
Extra commas. 2015-01-22 19:45:08 +00:00			`OptString.new('DontCrawl', [true, "Filestypes not to crawl", '.exe,.zip,.tar,.bz2,.run,.asc,.gz'])`
Fix msf/core and self.class msftidy warnings Also fixed rex requires. 2017-05-03 20:42:21 +00:00			`])`
Retab modules 2013-08-30 21:28:54 +00:00			`end`

			`attr_accessor :ctarget, :cport, :cssl`

			`def run`
			`i, a = 0, []`

			`self.ctarget = datastore['RHOSTS']`
			`self.cport = datastore['RPORT']`
			`self.cssl = datastore['SSL']`
			`inipath = datastore['PATH']`

			`cinipath = (inipath.nil? or inipath.empty?) ? '/' : inipath`

			`inireq = {`
			`'rhost' => ctarget,`
			`'rport' => cport,`
			`'uri' => cinipath,`
			`'method' => 'GET',`
			`'ctype' => 'text/plain',`
			`'ssl' => cssl,`
			`'query' => nil,`
			`'data' => nil`
			`}`

			`@NotViewedQueue = Rinda::TupleSpace.new`
			`@ViewedQueue = Hash.new`
			`@UriLimits = Hash.new`
			`@curent_site = self.ctarget`

			`insertnewpath(inireq)`

			`print_status("Loading modules: #{datastore['CrawlerModulesDir']}")`
			`load_modules(datastore['CrawlerModulesDir'])`
			`print_status("OK")`

			`if datastore['EnableUl']`
			`print_status("URI LIMITS ENABLED: #{datastore['MaxUriLimit']} (Maximum number of requests per uri)")`
			`end`

			`print_status("Target: #{self.ctarget} Port: #{self.cport} Path: #{cinipath} SSL: #{self.cssl}")`


			`begin`
			`reqfilter = reqtemplate(self.ctarget,self.cport,self.cssl)`

			`i =0`

			`loop do`

			`####`
			`#if i <= datastore['ThreadNum']`
			`# a.push(Thread.new {`
			`####`

			`hashreq = @NotViewedQueue.take(reqfilter, datastore['TakeTimeout'])`

			`ul = false`
			`if @UriLimits.include?(hashreq['uri']) and datastore['EnableUl']`
			`#puts "Request #{@UriLimits[hashreq['uri']]}/#{$maxurilimit} #{hashreq['uri']}"`
			`if @UriLimits[hashreq['uri']] >= datastore['MaxUriLimit']`
			`#puts "URI LIMIT Reached: #{$maxurilimit} for uri #{hashreq['uri']}"`
			`ul = true`
			`end`
			`else`
			`@UriLimits[hashreq['uri']] = 0`
			`end`

			`if !@ViewedQueue.include?(hashsig(hashreq)) and !ul`

			`@ViewedQueue[hashsig(hashreq)] = Time.now`
			`@UriLimits[hashreq['uri']] += 1`

			`if !File.extname(hashreq['uri']).empty? and datastore['DontCrawl'].include? File.extname(hashreq['uri'])`
			`vprint_status "URI not crawled #{hashreq['uri']}"`
			`else`
			`prx = nil`
			`#if self.useproxy`
			`# prx = "HTTP:"+self.proxyhost.to_s+":"+self.proxyport.to_s`
			`#end`

			`c = Rex::Proto::Http::Client.new(`
			`self.ctarget,`
			`self.cport.to_i,`
			`{},`
			`self.cssl,`
			`nil,`
			`prx`
			`)`

			`sendreq(c,hashreq)`
			`end`
			`else`
			`vprint_line "#{hashreq['uri']} already visited. "`
			`end`

			`####`
			`#})`

			`#i += 1`
			`#else`
			`# sleep(0.01) and a.delete_if {\|x\| not x.alive?} while not a.empty?`
			`# i = 0`
			`#end`
			`####`

			`end`
			`rescue Rinda::RequestExpiredError`
			`print_status("END.")`
			`return`
			`end`

			`print_status("Finished crawling")`
			`end`

			`def reqtemplate(target,port,ssl)`
			`hreq = {`
			`'rhost' => target,`
			`'rport' => port,`
			`'uri' => nil,`
			`'method' => nil,`
			`'ctype' => nil,`
			`'ssl' => ssl,`
			`'query' => nil,`
			`'data' => nil`
			`}`

			`return hreq`
			`end`

			`def storedb(hashreq,response,dbpath)`

Patching storedb function (adding host/port/ssl for correct report_web_page) 2017-03-13 16:37:47 +00:00			`# Added host/port/ssl for report_web_page support`
Retab modules 2013-08-30 21:28:54 +00:00			`info = {`
			`:web_site => @current_site,`
			`:path => hashreq['uri'],`
			`:query => hashreq['query'],`
Patching storedb function (adding host/port/ssl for correct report_web_page) 2017-03-13 16:37:47 +00:00			`:host => hashreq['rhost'],`
			`:port => hashreq['rport'],`
			`:ssl => !hashreq['ssl'].nil?,`
			`:data => hashreq['data'],`
			`:code => response.code,`
			`:body => response.body,`
			`:headers => response.headers`
Retab modules 2013-08-30 21:28:54 +00:00			`}`

			`#if response['content-type']`
			`# info[:ctype] = response['content-type'][0]`
			`#end`

			`#if response['set-cookie']`
			`# info[:cookie] = page.headers['set-cookie'].join("\n")`
			`#end`

			`#if page.headers['authorization']`
			`# info[:auth] = page.headers['authorization'].join("\n")`
			`#end`

			`#if page.headers['location']`
			`# info[:location] = page.headers['location'][0]`
			`#end`

			`#if page.headers['last-modified']`
			`# info[:mtime] = page.headers['last-modified'][0]`
			`#end`

			`# Report the web page to the database`
			`report_web_page(info)`
			`end`

			`#`
			`# Modified version of load_protocols from psnuffle by Max Moser <mmo@remote-exploit.org>`
			`#`

			`def load_modules(crawlermodulesdir)`

			`base = crawlermodulesdir`
			`if (not File.directory?(base))`
			`raise RuntimeError,"The Crawler modules parameter is set to an invalid directory"`
			`end`

			`@crawlermodules = {}`
			`cmodules = Dir.new(base).entries.grep(/\.rb$/).sort`
			`cmodules.each do \|n\|`
			`f = File.join(base, n)`
			`m = ::Module.new`
			`begin`
			`m.module_eval(File.read(f, File.size(f)))`
			`m.constants.grep(/^Crawler(.*)/) do`
			`cmod = $1`
			`klass = m.const_get("Crawler#{cmod}")`
			`@crawlermodules[cmod.downcase] = klass.new(self)`

			`print_status("Loaded crawler module #{cmod} from #{f}...")`
			`end`
			`rescue ::Exception => e`
			`print_error("Crawler module #{n} failed to load: #{e.class} #{e} #{e.backtrace}")`
			`end`
			`end`
			`end`

			`def sendreq(nclient,reqopts={})`

			`begin`
			`r = nclient.request_raw(reqopts)`
			`resp = nclient.send_recv(r, datastore['ReadTimeout'])`

			`if resp`
			`#`
			`# Quickfix for bug packet.rb to_s line: 190`
			`# In case modules or crawler calls to_s on de-chunked responses`
			`#`
			`resp.transfer_chunked = false`

			`if datastore['StoreDB']`
			`storedb(reqopts,resp,$dbpathmsf)`
			`end`

			`print_status ">> [#{resp.code}] #{reqopts['uri']}"`

			`if reqopts['query'] and !reqopts['query'].empty?`
			`print_status ">>> [Q] #{reqopts['query']}"`
			`end`

			`if reqopts['data']`
			`print_status ">>> [D] #{reqopts['data']}"`
			`end`

			`case resp.code`
			`when 200`
			`@crawlermodules.each_key do \|k\|`
			`@crawlermodules[k].parse(reqopts,resp)`
			`end`
			`when 301..303`
			`print_line("[#{resp.code}] Redirection to: #{resp['Location']}")`
			`vprint_status urltohash('GET',resp['Location'],reqopts['uri'],nil)`
			`insertnewpath(urltohash('GET',resp['Location'],reqopts['uri'],nil))`
			`when 404`
			`print_status "[404] Invalid link #{reqopts['uri']}"`
			`else`
			`print_status "Unhandled #{resp.code}"`
			`end`

			`else`
			`print_status "No response"`
			`end`
			`sleep(datastore['SleepTime'])`
			`rescue`
			`print_status "ERROR"`
			`vprint_status "#{$!}: #{$!.backtrace}"`
			`end`
			`end`

			`#`
			`# Add new path (uri) to test non-viewed queue`
			`#`

			`def insertnewpath(hashreq)`

			`hashreq['uri'] = canonicalize(hashreq['uri'])`

			`if hashreq['rhost'] == datastore['RHOSTS'] and hashreq['rport'] == datastore['RPORT']`
			`if !@ViewedQueue.include?(hashsig(hashreq))`
			`if @NotViewedQueue.read_all(hashreq).size > 0`
			`vprint_status "Already in queue to be viewed: #{hashreq['uri']}"`
			`else`
			`vprint_status "Inserted: #{hashreq['uri']}"`

			`@NotViewedQueue.write(hashreq)`
			`end`
			`else`
			`vprint_status "#{hashreq['uri']} already visited at #{@ViewedQueue[hashsig(hashreq)]}"`
			`end`
			`end`
			`end`

			`#`
			`# Build a new hash for a local path`
			`#`

			`def urltohash(m,url,basepath,dat)`

			`# m: method`
			`# url: uri?[query]`
			`# basepath: base path/uri to determine absolute path when relative`
			`# data: body data, nil if GET and query = uri.query`

			`uri = URI.parse(url)`
			`uritargetssl = (uri.scheme == "https") ? true : false`

			`uritargethost = uri.host`
			`if (uri.host.nil? or uri.host.empty?)`
			`uritargethost = self.ctarget`
			`uritargetssl = self.cssl`
			`end`

			`uritargetport = uri.port`
			`if (uri.port.nil?)`
			`uritargetport = self.cport`
			`end`

			`uritargetpath = uri.path`
			`if (uri.path.nil? or uri.path.empty?)`
			`uritargetpath = "/"`
			`end`

			`newp = Pathname.new(uritargetpath)`
			`oldp = Pathname.new(basepath)`
			`if !newp.absolute?`
			`if oldp.to_s[-1,1] == '/'`
			`newp = oldp+newp`
			`else`
			`if !newp.to_s.empty?`
			`newp = File.join(oldp.dirname,newp)`
			`end`
			`end`
			`end`

			`hashreq = {`
			`'rhost' => uritargethost,`
			`'rport' => uritargetport,`
			`'uri' => newp.to_s,`
			`'method' => m,`
			`'ctype' => 'text/plain',`
			`'ssl' => uritargetssl,`
			`'query' => uri.query,`
			`'data' => nil`
			`}`

			`if m == 'GET' and !dat.nil?`
			`hashreq['query'] = dat`
			`else`
			`hashreq['data'] = dat`
			`end`

			`return hashreq`
			`end`

			`# Taken from http://www.ruby-forum.com/topic/140101 by Rob Biedenharn`
			`def canonicalize(uri)`

			`u = uri.kind_of?(URI) ? uri : URI.parse(uri.to_s)`
			`u.normalize!`
			`newpath = u.path`
			`while newpath.gsub!(%r{([^/]+)/\.\./?}) { \|match\|`
			`$1 == '..' ? match : ''`
			`} do end`
			`newpath = newpath.gsub(%r{/\./}, '/').sub(%r{/\.\z}, '/')`
			`u.path = newpath`
			`# Ugly fix`
			`u.path = u.path.gsub("\/..\/","\/")`
			`u.to_s`
			`end`

			`def hashsig(hashreq)`
			`hashreq.to_s`
			`end`
first try on modular crawling git-svn-id: file:///home/svn/framework3/trunk@10915 4d416f70-5f16-0410-b530-b9f4589650da 2010-11-05 04:00:49 +00:00			`end`

			`class BaseParser`
Retab modules 2013-08-30 21:28:54 +00:00			`attr_accessor :crawler`

			`def initialize(c)`
			`self.crawler = c`
			`end`

			`def parse(request,result)`
			`nil`
			`end`

			`#`
			`# Add new path (uri) to test hash queue`
			`#`
			`def insertnewpath(hashreq)`
			`self.crawler.insertnewpath(hashreq)`
			`end`

			`def hashsig(hashreq)`
			`self.crawler.hashsig(hashreq)`
			`end`

			`def urltohash(m,url,basepath,dat)`
			`self.crawler.urltohash(m,url,basepath,dat)`
			`end`

			`def targetssl`
			`self.crawler.cssl`
			`end`

			`def targetport`
			`self.crawler.cport`
			`end`

			`def targethost`
			`self.crawler.ctarget`
			`end`

			`def targetinipath`
			`self.crawler.cinipath`
			`end`
first try on modular crawling git-svn-id: file:///home/svn/framework3/trunk@10915 4d416f70-5f16-0410-b530-b9f4589650da 2010-11-05 04:00:49 +00:00			`end`