metasploit-framework/lib/anemone/page.rb

require 'nokogiri'
require 'ostruct'
require 'webrick/cookie'

module Anemone

  # Path extractor container namespace.
  module Extractors
    class Base
      attr_reader :page

      def initialize( page )
        @page = page
      end

      def doc
        page.doc
      end
    end
  end

  class Page

    # The URL of the page
    attr_reader :url
    # The raw HTTP response body of the page
    attr_reader :body
    # Headers of the HTTP response
    attr_reader :headers
    # URL of the page this one redirected to, if any
    attr_reader :redirect_to
    # Exception object, if one was raised during HTTP#fetch_page
    attr_reader :error

    # OpenStruct for user-stored data
    attr_accessor :data
    # Integer response code of the page
    attr_accessor :code
    # Boolean indicating whether or not this page has been visited in PageStore#shortest_paths!
    attr_accessor :visited
    # Depth of this page from the root of the crawl. This is not necessarily the
    # shortest path; use PageStore#shortest_paths! to find that value.
    attr_accessor :depth
    # URL of the page that brought us to this page
    attr_accessor :referer
    # Response time of the request for this page in milliseconds
    attr_accessor :response_time
    # Storage for the original HTTP request that generated this response
    attr_accessor :request

    #
    # Create a new page
    #
    def initialize(url, params = {})
      @url = url
      @data = OpenStruct.new

      @code = params[:code]
      @headers = params[:headers] || {}
      @headers['content-type'] ||= ['']
      @aliases = Array(params[:aka]).compact
      @referer = params[:referer]
      @depth = params[:depth] || 0
      @redirect_to = to_absolute(params[:redirect_to])
      @response_time = params[:response_time]
      @body = params[:body]
      @error = params[:error]

      @fetched = !params[:code].nil?
    end

    def self.extractors
      return @extractors if @extractors

      lib = File.dirname( __FILE__ ) + '/extractors/*.rb'
      Dir.glob( lib ).each { |e| require e }

      @extractors = Extractors.constants.map do |e|
          next if e == :Base
          Extractors.const_get( e )
      end.compact
    end

    def run_extractors
      return [] if !doc
      self.class.extractors.map { |e| e.new( self ).run rescue next }.flatten.
          compact.map do |p|
              abs = to_absolute( URI( p ) ) rescue next
              !in_domain?( abs ) ? nil : abs
          end.compact.uniq
    end

    #
    # Array of distinct A tag HREFs from the page
    #
     # MODIFIED: Dig URLs from elements other than "A" refs
     #
    def links
      @links ||= run_extractors
    end

    #
    # Nokogiri document for the HTML body
    #
    def doc
      return @doc if @doc
      @doc = Nokogiri::HTML(@body) if @body && html? rescue nil
    end

    #
    # Delete the Nokogiri document and response body to conserve memory
    #
    def discard_doc!
      links # force parsing of page links before we trash the document
      @doc = @body = nil
    end

    #
    # Was the page successfully fetched?
    # +true+ if the page was fetched with no error, +false+ otherwise.
    #
    def fetched?
      @fetched
    end

    #
    # Array of cookies received with this page as WEBrick::Cookie objects.
    #
    def cookies
      WEBrick::Cookie.parse_set_cookies(@headers['Set-Cookie']) rescue []
    end

    #
    # The content-type returned by the HTTP request for this page
    #
    def content_type
      res = headers['content-type']
      res = res.first if res.kind_of?(::Array)
      res
    end

    #
    # Returns +true+ if the page is a HTML document, returns +false+
    # otherwise.
    #
    def html?
      !!(content_type =~ %r{^(text/html|application/xhtml+xml)\b})
    end

    #
    # Returns +true+ if the page is a HTTP redirect, returns +false+
    # otherwise.
    #
    def redirect?
      (300..307).include?(@code)
    end

    #
    # Returns +true+ if the page was not found (returned 404 code),
    # returns +false+ otherwise.
    #
    def not_found?
      404 == @code
    end

    #
    # Converts relative URL *link* into an absolute URL based on the
    # location of the page
    #
    def to_absolute(link)
      return nil if link.nil?

      # remove anchor
      link = URI.encode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,''))

      relative = URI(link)
      absolute = @url.merge(relative)

      absolute.path = '/' if absolute.path.empty?

      return absolute
    end

    #
    # Returns +true+ if *uri* is in the same domain as the page, returns
    # +false+ otherwise
    #
    def in_domain?(uri)
      uri.host == @url.host
    end

    def marshal_dump
      [@url, @headers, @data, @body, @links, @code, @visited, @depth, @referer, @redirect_to, @response_time, @fetched]
    end

    def marshal_load(ary)
      @url, @headers, @data, @body, @links, @code, @visited, @depth, @referer, @redirect_to, @response_time, @fetched = ary
    end

    def to_hash
      {'url' => @url.to_s,
       'headers' => Marshal.dump(@headers),
       'data' => Marshal.dump(@data),
       'body' => @body,
       'links' => links.map(&:to_s),
       'code' => @code,
       'visited' => @visited,
       'depth' => @depth,
       'referer' => @referer.to_s,
       'redirect_to' => @redirect_to.to_s,
       'response_time' => @response_time,
       'fetched' => @fetched}
    end

    def self.from_hash(hash)
      page = self.new(URI(hash['url']))
      {'@headers' => Marshal.load(hash['headers']),
       '@data' => Marshal.load(hash['data']),
       '@body' => hash['body'],
       '@links' => hash['links'].map { |link| URI(link) },
       '@code' => hash['code'].to_i,
       '@visited' => hash['visited'],
       '@depth' => hash['depth'].to_i,
       '@referer' => hash['referer'],
       '@redirect_to' => URI(hash['redirect_to']),
       '@response_time' => hash['response_time'].to_i,
       '@fetched' => hash['fetched']
      }.each do |var, value|
        page.instance_variable_set(var, value)
      end
      page
    end

    def dup
	  Marshal.load( Marshal.dump( self ) )
    end

  end
end
Initial import of an Anemone snapshot git-svn-id: file:///home/svn/framework3/trunk@10924 4d416f70-5f16-0410-b530-b9f4589650da 2010-11-06 04:34:43 +00:00			`require 'nokogiri'`
			`require 'ostruct'`
			`require 'webrick/cookie'`

			`module Anemone`
Updated support for Web modules and analysis techniques (committing to new clean branch due to corruption) 2012-11-01 19:14:38 +00:00
			`# Path extractor container namespace.`
			`module Extractors`
			`class Base`
			`attr_reader :page`

			`def initialize( page )`
			`@page = page`
			`end`

			`def doc`
			`page.doc`
			`end`
			`end`
			`end`

Initial import of an Anemone snapshot git-svn-id: file:///home/svn/framework3/trunk@10924 4d416f70-5f16-0410-b530-b9f4589650da 2010-11-06 04:34:43 +00:00			`class Page`

			`# The URL of the page`
			`attr_reader :url`
			`# The raw HTTP response body of the page`
			`attr_reader :body`
			`# Headers of the HTTP response`
			`attr_reader :headers`
			`# URL of the page this one redirected to, if any`
			`attr_reader :redirect_to`
			`# Exception object, if one was raised during HTTP#fetch_page`
			`attr_reader :error`

			`# OpenStruct for user-stored data`
			`attr_accessor :data`
			`# Integer response code of the page`
			`attr_accessor :code`
			`# Boolean indicating whether or not this page has been visited in PageStore#shortest_paths!`
			`attr_accessor :visited`
			`# Depth of this page from the root of the crawl. This is not necessarily the`
			`# shortest path; use PageStore#shortest_paths! to find that value.`
			`attr_accessor :depth`
			`# URL of the page that brought us to this page`
			`attr_accessor :referer`
			`# Response time of the request for this page in milliseconds`
			`attr_accessor :response_time`
Store the http request in the http response object when Rex::HTTP::Client.send_recv() is used. Let Anemone store the request as well, passing it into the Page object for further analysis. git-svn-id: file:///home/svn/framework3/trunk@11023 4d416f70-5f16-0410-b530-b9f4589650da 2010-11-13 05:25:39 +00:00			`# Storage for the original HTTP request that generated this response`
			`attr_accessor :request`
Updated support for Web modules and analysis techniques (committing to new clean branch due to corruption) 2012-11-01 19:14:38 +00:00
Initial import of an Anemone snapshot git-svn-id: file:///home/svn/framework3/trunk@10924 4d416f70-5f16-0410-b530-b9f4589650da 2010-11-06 04:34:43 +00:00			`#`
			`# Create a new page`
			`#`
			`def initialize(url, params = {})`
			`@url = url`
			`@data = OpenStruct.new`

			`@code = params[:code]`
			`@headers = params[:headers] \|\| {}`
			`@headers['content-type'] \|\|= ['']`
			`@aliases = Array(params[:aka]).compact`
			`@referer = params[:referer]`
			`@depth = params[:depth] \|\| 0`
			`@redirect_to = to_absolute(params[:redirect_to])`
			`@response_time = params[:response_time]`
			`@body = params[:body]`
			`@error = params[:error]`

			`@fetched = !params[:code].nil?`
			`end`

Updated support for Web modules and analysis techniques (committing to new clean branch due to corruption) 2012-11-01 19:14:38 +00:00			`def self.extractors`
			`return @extractors if @extractors`

			`lib = File.dirname( __FILE__ ) + '/extractors/*.rb'`
			`Dir.glob( lib ).each { \|e\| require e }`

			`@extractors = Extractors.constants.map do \|e\|`
			`next if e == :Base`
			`Extractors.const_get( e )`
			`end.compact`
			`end`

			`def run_extractors`
			`return [] if !doc`
			`self.class.extractors.map { \|e\| e.new( self ).run rescue next }.flatten.`
			`compact.map do \|p\|`
			`abs = to_absolute( URI( p ) ) rescue next`
			`!in_domain?( abs ) ? nil : abs`
			`end.compact.uniq`
			`end`

Initial import of an Anemone snapshot git-svn-id: file:///home/svn/framework3/trunk@10924 4d416f70-5f16-0410-b530-b9f4589650da 2010-11-06 04:34:43 +00:00			`#`
			`# Array of distinct A tag HREFs from the page`
			`#`
Updated support for Web modules and analysis techniques (committing to new clean branch due to corruption) 2012-11-01 19:14:38 +00:00			`# MODIFIED: Dig URLs from elements other than "A" refs`
			`#`
Initial import of an Anemone snapshot git-svn-id: file:///home/svn/framework3/trunk@10924 4d416f70-5f16-0410-b530-b9f4589650da 2010-11-06 04:34:43 +00:00			`def links`
Anemone::Page#links: removed upwards dir traversal [Finishes #47241427] 2013-04-01 21:49:40 +00:00			`@links \|\|= run_extractors`
Initial import of an Anemone snapshot git-svn-id: file:///home/svn/framework3/trunk@10924 4d416f70-5f16-0410-b530-b9f4589650da 2010-11-06 04:34:43 +00:00			`end`

			`#`
			`# Nokogiri document for the HTML body`
			`#`
			`def doc`
			`return @doc if @doc`
			`@doc = Nokogiri::HTML(@body) if @body && html? rescue nil`
			`end`

			`#`
			`# Delete the Nokogiri document and response body to conserve memory`
			`#`
			`def discard_doc!`
			`links # force parsing of page links before we trash the document`
			`@doc = @body = nil`
			`end`

			`#`
			`# Was the page successfully fetched?`
			`# +true+ if the page was fetched with no error, +false+ otherwise.`
			`#`
			`def fetched?`
			`@fetched`
			`end`

			`#`
			`# Array of cookies received with this page as WEBrick::Cookie objects.`
			`#`
			`def cookies`
			`WEBrick::Cookie.parse_set_cookies(@headers['Set-Cookie']) rescue []`
			`end`

			`#`
			`# The content-type returned by the HTTP request for this page`
			`#`
			`def content_type`
This merges in support for Rex sockets to anemone git-svn-id: file:///home/svn/framework3/trunk@11016 4d416f70-5f16-0410-b530-b9f4589650da 2010-11-13 03:33:36 +00:00			`res = headers['content-type']`
A working Rex backend for Anemone git-svn-id: file:///home/svn/framework3/trunk@11022 4d416f70-5f16-0410-b530-b9f4589650da 2010-11-13 05:20:51 +00:00			`res = res.first if res.kind_of?(::Array)`
This merges in support for Rex sockets to anemone git-svn-id: file:///home/svn/framework3/trunk@11016 4d416f70-5f16-0410-b530-b9f4589650da 2010-11-13 03:33:36 +00:00			`res`
Initial import of an Anemone snapshot git-svn-id: file:///home/svn/framework3/trunk@10924 4d416f70-5f16-0410-b530-b9f4589650da 2010-11-06 04:34:43 +00:00			`end`

			`#`
			`# Returns +true+ if the page is a HTML document, returns +false+`
			`# otherwise.`
			`#`
			`def html?`
			`!!(content_type =~ %r{^(text/html\|application/xhtml+xml)\b})`
			`end`

			`#`
			`# Returns +true+ if the page is a HTTP redirect, returns +false+`
			`# otherwise.`
			`#`
			`def redirect?`
			`(300..307).include?(@code)`
			`end`

			`#`
			`# Returns +true+ if the page was not found (returned 404 code),`
			`# returns +false+ otherwise.`
			`#`
			`def not_found?`
			`404 == @code`
			`end`

			`#`
			`# Converts relative URL link into an absolute URL based on the`
			`# location of the page`
			`#`
			`def to_absolute(link)`
			`return nil if link.nil?`

			`# remove anchor`
			`link = URI.encode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,''))`

			`relative = URI(link)`
			`absolute = @url.merge(relative)`

			`absolute.path = '/' if absolute.path.empty?`

			`return absolute`
			`end`

			`#`
			`# Returns +true+ if uri is in the same domain as the page, returns`
			`# +false+ otherwise`
			`#`
			`def in_domain?(uri)`
			`uri.host == @url.host`
			`end`

			`def marshal_dump`
			`[@url, @headers, @data, @body, @links, @code, @visited, @depth, @referer, @redirect_to, @response_time, @fetched]`
			`end`

			`def marshal_load(ary)`
			`@url, @headers, @data, @body, @links, @code, @visited, @depth, @referer, @redirect_to, @response_time, @fetched = ary`
			`end`

			`def to_hash`
			`{'url' => @url.to_s,`
			`'headers' => Marshal.dump(@headers),`
			`'data' => Marshal.dump(@data),`
			`'body' => @body,`
Updated support for Web modules and analysis techniques (committing to new clean branch due to corruption) 2012-11-01 19:14:38 +00:00			`'links' => links.map(&:to_s),`
Initial import of an Anemone snapshot git-svn-id: file:///home/svn/framework3/trunk@10924 4d416f70-5f16-0410-b530-b9f4589650da 2010-11-06 04:34:43 +00:00			`'code' => @code,`
			`'visited' => @visited,`
			`'depth' => @depth,`
			`'referer' => @referer.to_s,`
			`'redirect_to' => @redirect_to.to_s,`
			`'response_time' => @response_time,`
			`'fetched' => @fetched}`
			`end`

			`def self.from_hash(hash)`
			`page = self.new(URI(hash['url']))`
			`{'@headers' => Marshal.load(hash['headers']),`
			`'@data' => Marshal.load(hash['data']),`
			`'@body' => hash['body'],`
			`'@links' => hash['links'].map { \|link\| URI(link) },`
			`'@code' => hash['code'].to_i,`
			`'@visited' => hash['visited'],`
			`'@depth' => hash['depth'].to_i,`
			`'@referer' => hash['referer'],`
			`'@redirect_to' => URI(hash['redirect_to']),`
			`'@response_time' => hash['response_time'].to_i,`
			`'@fetched' => hash['fetched']`
			`}.each do \|var, value\|`
			`page.instance_variable_set(var, value)`
			`end`
			`page`
			`end`
Updated support for Web modules and analysis techniques (committing to new clean branch due to corruption) 2012-11-01 19:14:38 +00:00
			`def dup`
			`Marshal.load( Marshal.dump( self ) )`
			`end`

Initial import of an Anemone snapshot git-svn-id: file:///home/svn/framework3/trunk@10924 4d416f70-5f16-0410-b530-b9f4589650da 2010-11-06 04:34:43 +00:00			`end`
			`end`