Initial import of an Anemone snapshot

git-svn-id: file:///home/svn/framework3/trunk@10924 4d416f70-5f16-0410-b530-b9f4589650da
unstable
HD Moore 2010-11-06 04:34:43 +00:00
parent f6415186a8
commit b3cc6e19b6
28 changed files with 1712 additions and 0 deletions

2
lib/anemone.rb Normal file

@ -0,0 +1,2 @@
require 'rubygems'
require 'anemone/core'

24
lib/anemone/cli.rb Normal file

@ -0,0 +1,24 @@
module Anemone
module CLI
COMMANDS = %w[count cron pagedepth serialize url-list]
def self.run
command = ARGV.shift
if COMMANDS.include? command
load "anemone/cli/#{command.tr('-', '_')}.rb"
else
puts <<-INFO
Anemone is a web spider framework that can collect
useful information about pages it visits.
Usage:
anemone <command> [arguments]
Commands:
#{COMMANDS.join(', ')}
INFO
end
end
end
end

22
lib/anemone/cli/count.rb Normal file

@ -0,0 +1,22 @@
require 'anemone'
begin
# make sure that the first option is a URL we can crawl
url = URI(ARGV[0])
rescue
puts <<-INFO
Usage:
anemone count <url>
Synopsis:
Crawls a site starting at the given URL and outputs the total number
of unique pages on the site.
INFO
exit(0)
end
Anemone.crawl(url) do |anemone|
anemone.after_crawl do |pages|
puts pages.uniq!.size
end
end

90
lib/anemone/cli/cron.rb Normal file

@ -0,0 +1,90 @@
require 'anemone'
require 'optparse'
require 'ostruct'
options = OpenStruct.new
options.relative = false
options.output_file = 'urls.txt'
begin
# make sure that the last argument is a URL we can crawl
root = URI(ARGV.last)
rescue
puts <<-INFO
Usage:
anemone cron [options] <url>
Synopsis:
Combination of `count`, `pagedepth` and `url-list` commands.
Performs pagedepth, url list, and count functionality.
Outputs results to STDOUT and link list to file (urls.txt).
Meant to be run daily as a cron job.
Options:
-r, --relative Output relative URLs (rather than absolute)
-o, --output filename Filename to save URL list to. Defaults to urls.txt.
INFO
exit(0)
end
# parse command-line options
opts = OptionParser.new
opts.on('-r', '--relative') { options.relative = true }
opts.on('-o', '--output filename') {|o| options.output_file = o }
opts.parse!(ARGV)
Anemone.crawl(root, {:discard_page_bodies => true}) do |anemone|
anemone.after_crawl do |pages|
puts "Crawl results for #{root}\n"
# print a list of 404's
not_found = []
pages.each_value do |page|
url = page.url.to_s
not_found << url if page.not_found?
end
unless not_found.empty?
puts "\n404's:"
missing_links = pages.urls_linking_to(not_found)
missing_links.each do |url, links|
if options.relative
puts URI(url).path.to_s
else
puts url
end
links.slice(0..10).each do |u|
u = u.path if options.relative
puts " linked from #{u}"
end
puts " ..." if links.size > 10
end
print "\n"
end
# remove redirect aliases, and calculate pagedepths
pages = pages.shortest_paths!(root).uniq
depths = pages.values.inject({}) do |depths, page|
depths[page.depth] ||= 0
depths[page.depth] += 1
depths
end
# print the page count
puts "Total pages: #{pages.size}\n"
# print a list of depths
depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
# output a list of urls to file
file = open(options.output_file, 'w')
pages.keys.each do |url|
url = options.relative ? URI(url).path.to_s : url.to_s
file.puts url
end
end
end

32
lib/anemone/cli/pagedepth.rb Normal file

@ -0,0 +1,32 @@
require 'anemone'
begin
# make sure that the first option is a URL we can crawl
root = URI(ARGV[0])
rescue
puts <<-INFO
Usage:
anemone pagedepth <url>
Synopsis:
Crawls a site starting at the given URL and outputs a count of
the number of pages at each depth of the crawl.
INFO
exit(0)
end
Anemone.crawl(root) do |anemone|
anemone.skip_links_like %r{^/c/$}, %r{^/stores/$}
anemone.after_crawl do |pages|
pages = pages.shortest_paths!(root).uniq!
depths = pages.values.inject({}) do |depths, page|
depths[page.depth] ||= 0
depths[page.depth] += 1
depths
end
depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
end
end

35
lib/anemone/cli/serialize.rb Normal file

@ -0,0 +1,35 @@
require 'anemone'
require 'optparse'
require 'ostruct'
begin
# make sure that the first option is a URL we can crawl
root = URI(ARGV[0])
rescue
puts <<-INFO
Usage:
anemone serialize [options] <url>
Synopsis:
Crawls a site starting at the given URL and saves the resulting
PageStore object to a file using Marshal serialization.
Options:
-o, --output filename Filename to save PageStore to. Defaults to crawl.{Time.now}
INFO
exit(0)
end
options = OpenStruct.new
options.output_file = "crawl.#{Time.now.to_i}"
# parse command-line options
opts = OptionParser.new
opts.on('-o', '--output filename') {|o| options.output_file = o }
opts.parse!(ARGV)
Anemone.crawl(root) do |anemone|
anemone.after_crawl do |pages|
open(options.output_file, 'w') {|f| Marshal.dump(pages, f)}
end
end

41
lib/anemone/cli/url_list.rb Normal file

@ -0,0 +1,41 @@
require 'anemone'
require 'optparse'
require 'ostruct'
options = OpenStruct.new
options.relative = false
begin
# make sure that the last option is a URL we can crawl
root = URI(ARGV.last)
rescue
puts <<-INFO
Usage:
anemone url-list [options] <url>
Synopsis:
Crawls a site starting at the given URL, and outputs the URL of each page
in the domain as they are encountered.
Options:
-r, --relative Output relative URLs (rather than absolute)
INFO
exit(0)
end
# parse command-line options
opts = OptionParser.new
opts.on('-r', '--relative') { options.relative = true }
opts.parse!(ARGV)
Anemone.crawl(root, :discard_page_bodies => true) do |anemone|
anemone.on_every_page do |page|
if options.relative
puts page.url.path
else
puts page.url
end
end
end

35
lib/anemone/cookie_store.rb Normal file

@ -0,0 +1,35 @@
require 'delegate'
require 'webrick/cookie'
class WEBrick::Cookie
def expired?
!!expires && expires < Time.now
end
end
module Anemone
class CookieStore < DelegateClass(Hash)
def initialize(cookies = nil)
@cookies = {}
cookies.each { |name, value| @cookies[name] = WEBrick::Cookie.new(name, value) } if cookies
super(@cookies)
end
def merge!(set_cookie_str)
begin
cookie_hash = WEBrick::Cookie.parse_set_cookies(set_cookie_str).inject({}) do |hash, cookie|
hash[cookie.name] = cookie if !!cookie
hash
end
@cookies.merge! cookie_hash
rescue
end
end
def to_s
@cookies.values.reject { |cookie| cookie.expired? }.map { |cookie| "#{cookie.name}=#{cookie.value}" }.join(';')
end
end
end
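
The store above keeps WEBrick::Cookie objects behind a Hash interface: seed cookies arrive as plain name/value pairs, server cookies as raw Set-Cookie header strings. A minimal standalone sketch, assuming this snapshot's lib directory is on the load path (cookie names and values are invented):

require 'anemone/cookie_store'

store = Anemone::CookieStore.new('session' => 'abc123')
store.merge! 'tracking=xyz789; path=/'   # parsed via WEBrick::Cookie.parse_set_cookies
puts store.to_s                          # => "session=abc123;tracking=xyz789"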

294
lib/anemone/core.rb Normal file

@ -0,0 +1,294 @@
require 'thread'
require 'robots'
require 'anemone/tentacle'
require 'anemone/page'
require 'anemone/exceptions'
require 'anemone/page_store'
require 'anemone/storage'
require 'anemone/storage/base'
module Anemone
VERSION = '0.5.0';
#
# Convenience method to start a crawl
#
def Anemone.crawl(urls, options = {}, &block)
Core.crawl(urls, options, &block)
end
class Core
# PageStore storing all Page objects encountered during the crawl
attr_reader :pages
# Hash of options for the crawl
attr_reader :opts
DEFAULT_OPTS = {
# run 4 Tentacle threads to fetch pages
:threads => 4,
# disable verbose output
:verbose => false,
# don't throw away the page response body after scanning it for links
:discard_page_bodies => false,
# identify self as Anemone/VERSION
:user_agent => "Anemone/#{Anemone::VERSION}",
# no delay between requests
:delay => 0,
# don't obey the robots exclusion protocol
:obey_robots_txt => false,
# by default, don't limit the depth of the crawl
:depth_limit => false,
# number of times HTTP redirects will be followed
:redirect_limit => 5,
# storage engine defaults to Hash in +process_options+ if none specified
:storage => nil,
# Hash of cookie name => value to send with HTTP requests
:cookies => nil,
# accept cookies from the server and send them back?
:accept_cookies => false,
# skip any link with a query string? e.g. http://foo.com/?u=user
:skip_query_strings => false
}
# Create setter methods for all options to be called from the crawl block
DEFAULT_OPTS.keys.each do |key|
define_method "#{key}=" do |value|
@opts[key.to_sym] = value
end
end
#
# Initialize the crawl with starting *urls* (single URL or Array of URLs)
# and optional *block*
#
def initialize(urls, opts = {})
@urls = [urls].flatten.map{ |url| url.is_a?(URI) ? url : URI(url) }
@urls.each{ |url| url.path = '/' if url.path.empty? }
@tentacles = []
@on_every_page_blocks = []
@on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
@skip_link_patterns = []
@after_crawl_blocks = []
@opts = opts
yield self if block_given?
end
#
# Convenience method to start a new crawl
#
def self.crawl(urls, opts = {})
self.new(urls, opts) do |core|
yield core if block_given?
core.run
end
end
#
# Add a block to be executed on the PageStore after the crawl
# is finished
#
def after_crawl(&block)
@after_crawl_blocks << block
self
end
#
# Add one or more Regex patterns for URLs which should not be
# followed
#
def skip_links_like(*patterns)
@skip_link_patterns.concat [patterns].flatten.compact
self
end
#
# Add a block to be executed on every Page as they are encountered
# during the crawl
#
def on_every_page(&block)
@on_every_page_blocks << block
self
end
#
# Add a block to be executed on Page objects with a URL matching
# one or more patterns
#
def on_pages_like(*patterns, &block)
if patterns
patterns.each do |pattern|
@on_pages_like_blocks[pattern] << block
end
end
self
end
#
# Specify a block which will select which links to follow on each page.
# The block should return an Array of URI objects.
#
def focus_crawl(&block)
@focus_crawl_block = block
self
end
#
# Perform the crawl
#
def run
process_options
@urls.delete_if { |url| !visit_link?(url) }
return if @urls.empty?
link_queue = Queue.new
page_queue = Queue.new
@opts[:threads].times do
@tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts).run }
end
@urls.each{ |url| link_queue.enq(url) }
loop do
page = page_queue.deq
@pages.touch_key page.url
puts "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose]
do_page_blocks page
page.discard_doc! if @opts[:discard_page_bodies]
links = links_to_follow page
links.each do |link|
link_queue << [link, page.url.dup, page.depth + 1]
end
@pages.touch_keys links
@pages[page.url] = page
# if we are done with the crawl, tell the threads to end
if link_queue.empty? and page_queue.empty?
until link_queue.num_waiting == @tentacles.size
Thread.pass
end
if page_queue.empty?
@tentacles.size.times { link_queue << :END }
break
end
end
end
@tentacles.each { |thread| thread.join }
do_after_crawl_blocks
self
end
private
def process_options
@opts = DEFAULT_OPTS.merge @opts
@opts[:threads] = 1 if @opts[:delay] > 0
storage = Anemone::Storage::Base.new(@opts[:storage] || Anemone::Storage.Hash)
@pages = PageStore.new(storage)
@robots = Robots.new(@opts[:user_agent]) if @opts[:obey_robots_txt]
freeze_options
end
#
# Freeze the opts Hash so that no options can be modified
# once the crawl begins
#
def freeze_options
@opts.freeze
@opts.each_key { |key| @opts[key].freeze }
@opts[:cookies].each_key { |key| @opts[:cookies][key].freeze } rescue nil
end
#
# Execute the after_crawl blocks
#
def do_after_crawl_blocks
@after_crawl_blocks.each { |block| block.call(@pages) }
end
#
# Execute the on_every_page blocks for *page*
#
def do_page_blocks(page)
@on_every_page_blocks.each do |block|
block.call(page)
end
@on_pages_like_blocks.each do |pattern, blocks|
blocks.each { |block| block.call(page) } if page.url.to_s =~ pattern
end
end
#
# Return an Array of links to follow from the given page.
# Based on whether or not the link has already been crawled,
# and the block given to focus_crawl()
#
def links_to_follow(page)
links = @focus_crawl_block ? @focus_crawl_block.call(page) : page.links
links.select { |link| visit_link?(link, page) }.map { |link| link.dup }
end
#
# Returns +true+ if *link* has not been visited already,
# and is not excluded by a skip_link pattern...
# and is not excluded by robots.txt...
# and is not deeper than the depth limit
# Returns +false+ otherwise.
#
def visit_link?(link, from_page = nil)
!@pages.has_page?(link) &&
!skip_link?(link) &&
!skip_query_string?(link) &&
allowed(link) &&
!too_deep?(from_page)
end
#
# Returns +true+ if we are obeying robots.txt and the link
# is granted access in it. Always returns +true+ when we are
# not obeying robots.txt.
#
def allowed(link)
@opts[:obey_robots_txt] ? @robots.allowed?(link) : true
end
#
# Returns +true+ if we are over the page depth limit.
# This only works when coming from a page and with the +depth_limit+ option set.
# When neither is the case, will always return +false+.
def too_deep?(from_page)
if from_page && @opts[:depth_limit]
from_page.depth >= @opts[:depth_limit]
else
false
end
end
#
# Returns +true+ if *link* should not be visited because
# it has a query string and +skip_query_strings+ is true.
#
def skip_query_string?(link)
@opts[:skip_query_strings] && link.query
end
#
# Returns +true+ if *link* should not be visited because
# its URL matches a skip_link pattern.
#
def skip_link?(link)
@skip_link_patterns.any? { |pattern| link.path =~ pattern }
end
end
end
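
Core generates a setter for every DEFAULT_OPTS key, so options and callbacks can all be configured inside the block handed to Anemone.crawl. A hedged sketch of that pattern; the URL and patterns are placeholders, and the robots and nokogiri gems the library depends on must be installed:

require 'anemone'

Anemone.crawl('http://www.example.com/') do |anemone|
  anemone.threads     = 2          # each setter writes the matching DEFAULT_OPTS key
  anemone.depth_limit = 3
  anemone.verbose     = true
  anemone.skip_links_like %r{/logout}, %r{\.pdf$}
  anemone.on_every_page { |page| puts "#{page.code} #{page.url}" }
  anemone.after_crawl  { |pages| puts "crawled #{pages.size} pages" }
end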

75
lib/anemone/docs/CHANGELOG.rdoc Normal file

@ -0,0 +1,75 @@
== 0.5.0 / 2010-09-01
* Major enhancements
* Added page storage engines for MongoDB and Redis
* Minor enhancements
* Use xpath for link parsing instead of CSS (faster) (Marc Seeger)
* Added skip_query_strings option to skip links with query strings (Joost Baaij)
* Bug fixes
* Only consider status code 300..307 a redirect (Marc Seeger)
* Canonicalize redirect links (Marc Seeger)
== 0.4.0 / 2010-04-08
* Major enhancements
* Cookies can be accepted and sent with each HTTP request.
== 0.3.2 / 2010-02-04
* Bug fixes
* Fixed issue that allowed following redirects off the original domain
== 0.3.1 / 2010-01-22
* Minor enhancements
* Added an attr_accessor to Page for the HTTP response body
* Bug fixes
* Fixed incorrect method calls in CLI scripts
== 0.3.0 / 2009-12-15
* Major enhancements
* Option for persistent storage of pages during crawl with TokyoCabinet or PStore
* Minor enhancements
* Options can be set via methods on the Core object in the crawl block
== 0.2.3 / 2009-11-01
* Minor enhancements
* Options are now applied per-crawl, rather than module-wide.
* Bug fixes
* Fixed a bug which caused deadlock if an exception occurred when crawling the last page in the queue.
== 0.2.2 / 2009-10-26
* Minor enhancements
* When the :verbose option is set to true, exception backtraces are printed to aid debugging.
== 0.2.1 / 2009-10-24
* Major enhancements
* Added HTTPS support.
* CLI program 'anemone', which is a frontend for several tasks.
* Minor enhancements
* HTTP request response time recorded in Page.
* Use of persistent HTTP connections.

4
lib/anemone/docs/CONTRIBUTORS.md Normal file

@ -0,0 +1,4 @@
Many thanks to the following folks who have contributed code to Anemone. In no particular order:
Marc Seeger
Joost Baaij

19
lib/anemone/docs/LICENSE.txt Normal file

@ -0,0 +1,19 @@
Copyright (c) 2009 Vertive, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.


@ -0,0 +1,3 @@
This snapshot of Anemone has been modified for use within the Metasploit Framework
* None yet, this is the initial import

36
lib/anemone/docs/README.rdoc Normal file

@ -0,0 +1,36 @@
= Anemone
Anemone is a web spider framework that can spider a domain and collect useful
information about the pages it visits. It is versatile, allowing you to
write your own specialized spider tasks quickly and easily.
See http://anemone.rubyforge.org for more information.
== Features
* Multi-threaded design for high performance
* Tracks 301 HTTP redirects
* Built-in BFS algorithm for determining page depth
* Allows exclusion of URLs based on regular expressions
* Choose the links to follow on each page with focus_crawl()
* HTTPS support
* Records response time for each page
* CLI program can list all pages in a domain, calculate page depths, and more
* Obey robots.txt
* In-memory or persistent storage of pages during crawl, using TokyoCabinet, MongoDB, or Redis
== Examples
See the scripts under the <tt>lib/anemone/cli</tt> directory for examples of several useful Anemone tasks.
== Requirements
* nokogiri
* robots
== Development
To test and develop this gem, additional requirements are:
* rspec
* fakeweb
* tokyocabinet
* mongo
* redis
You will need to have {Tokyo Cabinet}[http://fallabs.com/tokyocabinet/], {MongoDB}[http://www.mongodb.org/], and {Redis}[http://code.google.com/p/redis/] installed on your system and running.
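
The focus_crawl() hook mentioned above selects which of a page's links are followed, and on_pages_like() runs a callback only for matching URLs. A short illustrative sketch; the site and patterns are invented:

require 'anemone'

Anemone.crawl('http://store.example.com/') do |anemone|
  # follow at most five links per page; the block must return an Array of URIs
  anemone.focus_crawl { |page| page.links.first(5) }
  # react only to URLs matching the pattern
  anemone.on_pages_like(%r{/products/\d+}) { |page| puts page.url }
end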

26
lib/anemone/docs/Rakefile Normal file

@ -0,0 +1,26 @@
require 'rubygems'
require 'rake'
require 'spec/rake/spectask'
Spec::Rake::SpecTask.new(:spec) do |spec|
spec.libs << 'lib' << 'spec'
spec.spec_files = FileList['spec/**/*_spec.rb']
end
Spec::Rake::SpecTask.new(:rcov) do |spec|
spec.libs << 'lib' << 'spec'
spec.pattern = 'spec/**/*_spec.rb'
spec.rcov = true
end
task :default => :spec
require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
version = File.exist?('VERSION') ? File.read('VERSION') : ""
rdoc.rdoc_dir = 'rdoc'
rdoc.title = "anemone #{version}"
rdoc.rdoc_files.include('README*')
rdoc.rdoc_files.include('lib/**/*.rb')
end

1
lib/anemone/docs/VERSION Normal file

@ -0,0 +1 @@
0.5.0

5
lib/anemone/exceptions.rb Normal file

@ -0,0 +1,5 @@
module Anemone
class Error < ::StandardError
attr_accessor :wrapped_exception
end
end

157
lib/anemone/http.rb Normal file

@ -0,0 +1,157 @@
require 'net/https'
require 'anemone/page'
require 'anemone/cookie_store'
module Anemone
class HTTP
# Maximum number of redirects to follow on each get_response
REDIRECT_LIMIT = 5
# CookieStore for this HTTP client
attr_reader :cookie_store
def initialize(opts = {})
@connections = {}
@opts = opts
@cookie_store = CookieStore.new(@opts[:cookies])
end
#
# Fetch a single Page from the response of an HTTP request to *url*.
# Just gets the final destination page.
#
def fetch_page(url, referer = nil, depth = nil)
fetch_pages(url, referer, depth).last
end
#
# Create new Pages from the response of an HTTP request to *url*,
# including redirects
#
def fetch_pages(url, referer = nil, depth = nil)
begin
url = URI(url) unless url.is_a?(URI)
pages = []
get(url, referer) do |response, code, location, redirect_to, response_time|
pages << Page.new(location, :body => response.body.dup,
:code => code,
:headers => response.to_hash,
:referer => referer,
:depth => depth,
:redirect_to => redirect_to,
:response_time => response_time)
end
return pages
rescue => e
if verbose?
puts e.inspect
puts e.backtrace
end
return [Page.new(url, :error => e)]
end
end
#
# The maximum number of redirects to follow
#
def redirect_limit
@opts[:redirect_limit] || REDIRECT_LIMIT
end
#
# The user-agent string which will be sent with each request,
# or nil if no such option is set
#
def user_agent
@opts[:user_agent]
end
#
# Does this HTTP client accept cookies from the server?
#
def accept_cookies?
@opts[:accept_cookies]
end
private
#
# Retrieve HTTP responses for *url*, including redirects.
# Yields the response object, response code, and URI location
# for each response.
#
def get(url, referer = nil)
limit = redirect_limit
loc = url
begin
# if redirected to a relative url, merge it with the host of the original
# request url
loc = url.merge(loc) if loc.relative?
response, response_time = get_response(loc, referer)
code = Integer(response.code)
redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']).normalize : nil
yield response, code, loc, redirect_to, response_time
limit -= 1
end while (loc = redirect_to) && allowed?(redirect_to, url) && limit > 0
end
#
# Get an HTTPResponse for *url*, sending the appropriate User-Agent string
#
def get_response(url, referer = nil)
full_path = url.query.nil? ? url.path : "#{url.path}?#{url.query}"
opts = {}
opts['User-Agent'] = user_agent if user_agent
opts['Referer'] = referer.to_s if referer
opts['Cookie'] = @cookie_store.to_s unless @cookie_store.empty? || (!accept_cookies? && @opts[:cookies].nil?)
retries = 0
begin
start = Time.now()
response = connection(url).get(full_path, opts)
finish = Time.now()
response_time = ((finish - start) * 1000).round
@cookie_store.merge!(response['Set-Cookie']) if accept_cookies?
return response, response_time
rescue EOFError
refresh_connection(url)
retries += 1
retry unless retries > 3
end
end
def connection(url)
@connections[url.host] ||= {}
if conn = @connections[url.host][url.port]
return conn
end
refresh_connection url
end
def refresh_connection(url)
http = Net::HTTP.new(url.host, url.port)
if url.scheme == 'https'
http.use_ssl = true
http.verify_mode = OpenSSL::SSL::VERIFY_NONE
end
@connections[url.host][url.port] = http.start
end
def verbose?
@opts[:verbose]
end
#
# Allowed to connect to the requested url?
#
def allowed?(to_url, from_url)
to_url.host.nil? || (to_url.host == from_url.host)
end
end
end
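
The HTTP client above can also be driven on its own, outside a crawl. A minimal sketch with a placeholder URL:

require 'anemone/http'

http = Anemone::HTTP.new(:user_agent => 'Anemone/0.5.0')
page = http.fetch_page(URI('http://www.example.com/'))
puts "#{page.code} #{page.url} (#{page.response_time} ms)"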

202
lib/anemone/page.rb Normal file

@ -0,0 +1,202 @@
require 'nokogiri'
require 'ostruct'
require 'webrick/cookie'
module Anemone
class Page
# The URL of the page
attr_reader :url
# The raw HTTP response body of the page
attr_reader :body
# Headers of the HTTP response
attr_reader :headers
# URL of the page this one redirected to, if any
attr_reader :redirect_to
# Exception object, if one was raised during HTTP#fetch_page
attr_reader :error
# OpenStruct for user-stored data
attr_accessor :data
# Integer response code of the page
attr_accessor :code
# Boolean indicating whether or not this page has been visited in PageStore#shortest_paths!
attr_accessor :visited
# Depth of this page from the root of the crawl. This is not necessarily the
# shortest path; use PageStore#shortest_paths! to find that value.
attr_accessor :depth
# URL of the page that brought us to this page
attr_accessor :referer
# Response time of the request for this page in milliseconds
attr_accessor :response_time
#
# Create a new page
#
def initialize(url, params = {})
@url = url
@data = OpenStruct.new
@code = params[:code]
@headers = params[:headers] || {}
@headers['content-type'] ||= ['']
@aliases = Array(params[:aka]).compact
@referer = params[:referer]
@depth = params[:depth] || 0
@redirect_to = to_absolute(params[:redirect_to])
@response_time = params[:response_time]
@body = params[:body]
@error = params[:error]
@fetched = !params[:code].nil?
end
#
# Array of distinct A tag HREFs from the page
#
def links
return @links unless @links.nil?
@links = []
return @links if !doc
doc.search("//a[@href]").each do |a|
u = a['href']
next if u.nil? or u.empty?
abs = to_absolute(URI(u)) rescue next
@links << abs if in_domain?(abs)
end
@links.uniq!
@links
end
#
# Nokogiri document for the HTML body
#
def doc
return @doc if @doc
@doc = Nokogiri::HTML(@body) if @body && html? rescue nil
end
#
# Delete the Nokogiri document and response body to conserve memory
#
def discard_doc!
links # force parsing of page links before we trash the document
@doc = @body = nil
end
#
# Was the page successfully fetched?
# +true+ if the page was fetched with no error, +false+ otherwise.
#
def fetched?
@fetched
end
#
# Array of cookies received with this page as WEBrick::Cookie objects.
#
def cookies
WEBrick::Cookie.parse_set_cookies(@headers['Set-Cookie']) rescue []
end
#
# The content-type returned by the HTTP request for this page
#
def content_type
headers['content-type'].first
end
#
# Returns +true+ if the page is a HTML document, returns +false+
# otherwise.
#
def html?
!!(content_type =~ %r{^(text/html|application/xhtml\+xml)\b})
end
#
# Returns +true+ if the page is a HTTP redirect, returns +false+
# otherwise.
#
def redirect?
(300..307).include?(@code)
end
#
# Returns +true+ if the page was not found (returned 404 code),
# returns +false+ otherwise.
#
def not_found?
404 == @code
end
#
# Converts relative URL *link* into an absolute URL based on the
# location of the page
#
def to_absolute(link)
return nil if link.nil?
# remove anchor
link = URI.encode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,''))
relative = URI(link)
absolute = @url.merge(relative)
absolute.path = '/' if absolute.path.empty?
return absolute
end
#
# Returns +true+ if *uri* is in the same domain as the page, returns
# +false+ otherwise
#
def in_domain?(uri)
uri.host == @url.host
end
def marshal_dump
[@url, @headers, @data, @body, @links, @code, @visited, @depth, @referer, @redirect_to, @response_time, @fetched]
end
def marshal_load(ary)
@url, @headers, @data, @body, @links, @code, @visited, @depth, @referer, @redirect_to, @response_time, @fetched = ary
end
def to_hash
{'url' => @url.to_s,
'headers' => Marshal.dump(@headers),
'data' => Marshal.dump(@data),
'body' => @body,
'links' => links.map(&:to_s),
'code' => @code,
'visited' => @visited,
'depth' => @depth,
'referer' => @referer.to_s,
'redirect_to' => @redirect_to.to_s,
'response_time' => @response_time,
'fetched' => @fetched}
end
def self.from_hash(hash)
page = self.new(URI(hash['url']))
{'@headers' => Marshal.load(hash['headers']),
'@data' => Marshal.load(hash['data']),
'@body' => hash['body'],
'@links' => hash['links'].map { |link| URI(link) },
'@code' => hash['code'].to_i,
'@visited' => hash['visited'],
'@depth' => hash['depth'].to_i,
'@referer' => hash['referer'],
'@redirect_to' => URI(hash['redirect_to']),
'@response_time' => hash['response_time'].to_i,
'@fetched' => hash['fetched']
}.each do |var, value|
page.instance_variable_set(var, value)
end
page
end
end
end
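
Page#to_hash and Page.from_hash above are what the MongoDB and Redis storage adapters later in this import rely on. A round-trip sketch with an invented URL and body:

require 'uri'
require 'anemone/page'

page = Anemone::Page.new(URI('http://www.example.com/'),
                         :code    => 200,
                         :headers => { 'content-type' => ['text/html'] },
                         :body    => '<html><a href="/about">About</a></html>')
copy = Anemone::Page.from_hash(page.to_hash)
puts copy.links.map(&:to_s).inspect   # => ["http://www.example.com/about"]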

160
lib/anemone/page_store.rb Normal file

@ -0,0 +1,160 @@
require 'forwardable'
module Anemone
class PageStore
extend Forwardable
def_delegators :@storage, :keys, :values, :size, :each
def initialize(storage = {})
@storage = storage
end
# We typically index the hash with a URI,
# but convert it to a String for easier retrieval
def [](index)
@storage[index.to_s]
end
def []=(index, other)
@storage[index.to_s] = other
end
def delete(key)
@storage.delete key.to_s
end
def has_key?(key)
@storage.has_key? key.to_s
end
def each_value
each { |key, value| yield value }
end
def values
result = []
each { |key, value| result << value }
result
end
def touch_key(key)
self[key] = Page.new(key)
end
def touch_keys(keys)
@storage.merge! keys.inject({}) { |h, k| h[k.to_s] = Page.new(k); h }
end
# Does this PageStore contain the specified URL?
# HTTP and HTTPS versions of a URL are considered to be the same page.
def has_page?(url)
schemes = %w(http https)
if schemes.include? url.scheme
u = url.dup
return schemes.any? { |s| u.scheme = s; has_key?(u) }
end
has_key? url
end
#
# Use a breadth-first search to calculate the single-source
# shortest paths from *root* to all pages in the PageStore
#
def shortest_paths!(root)
root = URI(root) if root.is_a?(String)
raise "Root node not found" if !has_key?(root)
q = Queue.new
q.enq root
root_page = self[root]
root_page.depth = 0
root_page.visited = true
self[root] = root_page
while !q.empty?
page = self[q.deq]
page.links.each do |u|
begin
link = self[u]
next if link.nil? || !link.fetched? || link.visited
q << u unless link.redirect?
link.visited = true
link.depth = page.depth + 1
self[u] = link
if link.redirect?
u = link.redirect_to
redo
end
end
end
end
self
end
#
# Removes all Pages from storage where redirect? is true
#
def uniq!
each_value { |page| delete page.url if page.redirect? }
self
end
#
# If given a single URL (as a String or URI), returns an Array of Pages which link to that URL
# If given an Array of URLs, returns a Hash (URI => [Page, Page...]) of Pages linking to those URLs
#
def pages_linking_to(urls)
unless urls.is_a?(Array)
urls = [urls]
single = true
end
urls.map! do |url|
unless url.is_a?(URI)
URI(url) rescue nil
else
url
end
end
urls.compact!
links = {}
urls.each { |url| links[url] = [] }
values.each do |page|
urls.each { |url| links[url] << page if page.links.include?(url) }
end
if single and !links.empty?
return links[urls.first]
else
return links
end
end
#
# If given a single URL (as a String or URI), returns an Array of URLs which link to that URL
# If given an Array of URLs, returns a Hash (URI => [URI, URI...]) of URLs linking to those URLs
#
def urls_linking_to(urls)
unless urls.is_a?(Array)
urls = [urls]
single = true
end
links = pages_linking_to(urls)
links.each { |url, pages| links[url] = pages.map{|p| p.url} }
if single and !links.empty?
return links[urls.first]
else
return links
end
end
end
end
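
PageStore#has_page? treats the HTTP and HTTPS forms of a URL as the same page. A quick sketch against the default in-memory store; the URL is invented:

require 'uri'
require 'anemone/page'
require 'anemone/page_store'

store = Anemone::PageStore.new
store.touch_key URI('http://www.example.com/')
puts store.has_page?(URI('https://www.example.com/'))   # => true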

34
lib/anemone/storage.rb Normal file

@ -0,0 +1,34 @@
module Anemone
module Storage
def self.Hash(*args)
hash = Hash.new(*args)
# add close method for compatibility with Storage::Base
class << hash; def close; end; end
hash
end
def self.PStore(*args)
require 'anemone/storage/pstore'
self::PStore.new(*args)
end
def self.TokyoCabinet(file = 'anemone.tch')
require 'anemone/storage/tokyo_cabinet'
self::TokyoCabinet.new(file)
end
def self.MongoDB(mongo_db = nil, collection_name = 'pages')
require 'anemone/storage/mongodb'
mongo_db ||= Mongo::Connection.new.db('anemone')
raise "First argument must be an instance of Mongo::DB" unless mongo_db.is_a?(Mongo::DB)
self::MongoDB.new(mongo_db, collection_name)
end
def self.Redis(opts = {})
require 'anemone/storage/redis'
self::Redis.new(opts)
end
end
end
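
These factory methods feed the :storage option consumed by Core#process_options. A hedged sketch using the PStore backend; the URL and filename are placeholders, and the MongoDB, Redis, and TokyoCabinet factories plug in the same way:

require 'anemone'

Anemone.crawl('http://www.example.com/') do |anemone|
  anemone.storage = Anemone::Storage.PStore('crawl.pstore')
  anemone.after_crawl { |pages| puts "stored #{pages.size} pages" }
end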

75
lib/anemone/storage/base.rb Normal file

@ -0,0 +1,75 @@
require 'anemone/storage/exceptions'
module Anemone
module Storage
class Base
def initialize(adapter)
@adap = adapter
# verify adapter conforms to this class's methods
methods.each do |method|
if !@adap.respond_to?(method.to_sym)
raise "Storage adapter must support method #{method}"
end
end
end
def [](key)
@adap[key]
rescue
puts key
raise RetrievalError, $!
end
def []=(key, value)
@adap[key] = value
rescue
raise InsertionError, $!
end
def delete(key)
@adap.delete(key)
rescue
raise DeletionError, $!
end
def each
@adap.each { |k, v| yield k, v }
rescue
raise GenericError, $!
end
def merge!(hash)
@adap.merge!(hash)
rescue
raise GenericError, $!
end
def close
@adap.close
rescue
raise CloseError, $!
end
def size
@adap.size
rescue
raise GenericError, $!
end
def keys
@adap.keys
rescue
raise GenericError, $!
end
def has_key?(key)
@adap.has_key?(key)
rescue
raise GenericError, $!
end
end
end
end
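
Storage::Base is a thin guard around an adapter: it verifies the adapter responds to every method it will delegate to, and converts adapter failures into the exception classes defined in the next file. A small sketch wrapping the default Hash adapter:

require 'anemone/exceptions'     # Anemone::Error, parent of the storage errors
require 'anemone/storage'
require 'anemone/storage/base'

store = Anemone::Storage::Base.new(Anemone::Storage.Hash)
store['http://www.example.com/'] = :placeholder    # Core stores Page objects here
puts store.size                                     # => 1
puts store.has_key?('http://www.example.com/')      # => true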

15
lib/anemone/storage/exceptions.rb Normal file

@ -0,0 +1,15 @@
module Anemone
module Storage
class GenericError < Error; end;
class ConnectionError < Error; end
class RetrievalError < Error; end
class InsertionError < Error; end
class DeletionError < Error; end
class CloseError < Error; end
end
end

89
lib/anemone/storage/mongodb.rb Normal file

@ -0,0 +1,89 @@
begin
require 'mongo'
rescue LoadError
puts "You need the mongo gem to use Anemone::Storage::MongoDB"
exit
end
module Anemone
module Storage
class MongoDB
BINARY_FIELDS = %w(body headers data)
def initialize(mongo_db, collection_name)
@db = mongo_db
@collection = @db[collection_name]
@collection.remove
@collection.create_index 'url'
end
def [](url)
if value = @collection.find_one('url' => url.to_s)
load_page(value)
end
end
def []=(url, page)
hash = page.to_hash
BINARY_FIELDS.each do |field|
hash[field] = BSON::Binary.new(hash[field]) unless hash[field].nil?
end
@collection.update(
{'url' => page.url.to_s},
hash,
:upsert => true
)
end
def delete(url)
page = self[url]
@collection.remove('url' => url.to_s)
page
end
def each
@collection.find do |cursor|
cursor.each do |doc|
page = load_page(doc)
yield page.url.to_s, page
end
end
end
def merge!(hash)
hash.each { |key, value| self[key] = value }
self
end
def size
@collection.count
end
def keys
keys = []
self.each { |k, v| keys << k.to_s }
keys
end
def has_key?(url)
!!@collection.find_one('url' => url.to_s)
end
def close
@db.connection.close
end
private
def load_page(hash)
BINARY_FIELDS.each do |field|
hash[field] = hash[field].to_s
end
Page.from_hash(hash)
end
end
end
end

50
lib/anemone/storage/pstore.rb Normal file

@ -0,0 +1,50 @@
require 'pstore'
require 'forwardable'
module Anemone
module Storage
class PStore
extend Forwardable
def_delegators :@keys, :has_key?, :keys, :size
def initialize(file)
File.delete(file) if File.exists?(file)
@store = ::PStore.new(file)
@keys = {}
end
def [](key)
@store.transaction { |s| s[key] }
end
def []=(key,value)
@keys[key] = nil
@store.transaction { |s| s[key] = value }
end
def delete(key)
@keys.delete(key)
@store.transaction { |s| s.delete key}
end
def each
@keys.each_key do |key|
value = nil
@store.transaction { |s| value = s[key] }
yield key, value
end
end
def merge!(hash)
@store.transaction do |s|
hash.each { |key, value| s[key] = value; @keys[key] = nil }
end
self
end
def close; end
end
end
end

90
lib/anemone/storage/redis.rb Normal file

@ -0,0 +1,90 @@
require 'redis'
module Anemone
module Storage
class Redis
MARSHAL_FIELDS = %w(links visited fetched)
def initialize(opts = {})
@redis = ::Redis.new(opts)
@key_prefix = opts[:key_prefix] || 'anemone'
keys.each { |key| delete(key) }
end
def [](key)
rkey = "#{@key_prefix}:pages:#{key.to_s}"
rget(rkey)
end
def []=(key, value)
rkey = "#{@key_prefix}:pages:#{key.to_s}"
hash = value.to_hash
MARSHAL_FIELDS.each do |field|
hash[field] = Marshal.dump(hash[field])
end
hash.each do |field, value|
@redis.hset(rkey, field, value)
end
end
def delete(key)
rkey = "#{@key_prefix}:pages:#{key.to_s}"
page = self[key]
@redis.del(rkey)
page
end
def each
rkeys = @redis.keys("#{@key_prefix}:pages:*")
rkeys.each do |rkey|
page = rget(rkey)
yield page.url.to_s, page
end
end
def merge!(hash)
hash.each { |key, value| self[key] = value }
self
end
def size
@redis.keys("#{@key_prefix}:pages:*").size
end
def keys
keys = []
self.each { |k, v| keys << k.to_s }
keys
end
def has_key?(key)
rkey = "#{@key_prefix}:pages:#{key.to_s}"
@redis.exists(rkey)
end
def close
@redis.quit
end
private
def load_value(hash)
MARSHAL_FIELDS.each do |field|
unless hash[field].nil? || hash[field] == ''
hash[field] = Marshal.load(hash[field])
end
end
Page.from_hash(hash)
end
def rget(rkey)
hash = @redis.hgetall(rkey)
if !!hash
load_value(hash)
end
end
end
end
end

57
lib/anemone/storage/tokyo_cabinet.rb Normal file

@ -0,0 +1,57 @@
begin
require 'tokyocabinet'
rescue LoadError
puts "You need the tokyocabinet gem to use Anemone::Storage::TokyoCabinet"
exit
end
require 'forwardable'
module Anemone
module Storage
class TokyoCabinet
extend Forwardable
def_delegators :@db, :close, :size, :keys, :has_key?
def initialize(file)
raise "TokyoCabinet filename must have .tch extension" if File.extname(file) != '.tch'
@db = ::TokyoCabinet::HDB::new
@db.open(file, ::TokyoCabinet::HDB::OWRITER | ::TokyoCabinet::HDB::OCREAT)
@db.clear
end
def [](key)
if value = @db[key]
load_value(value)
end
end
def []=(key, value)
@db[key] = [Marshal.dump(value)].pack("m")
end
def delete(key)
value = self[key]
@db.delete(key)
value
end
def each
@db.each { |k, v| yield k, load_value(v) }
end
def merge!(hash)
hash.each { |key, value| self[key] = value }
self
end
private
def load_value(value)
Marshal.load(value.unpack("m")[0])
end
end
end
end

39
lib/anemone/tentacle.rb Normal file

@ -0,0 +1,39 @@
require 'anemone/http'
module Anemone
class Tentacle
#
# Create a new Tentacle
#
def initialize(link_queue, page_queue, opts = {})
@link_queue = link_queue
@page_queue = page_queue
@http = Anemone::HTTP.new(opts)
@opts = opts
end
#
# Gets links from @link_queue, and returns the fetched
# Page objects into @page_queue
#
def run
loop do
link, referer, depth = @link_queue.deq
break if link == :END
@http.fetch_pages(link, referer, depth).each { |page| @page_queue << page }
delay
end
end
private
def delay
sleep @opts[:delay] if @opts[:delay] > 0
end
end
end