Initial import of an Anemone snapshot

git-svn-id: file:///home/svn/framework3/trunk@10924 4d416f70-5f16-0410-b530-b9f4589650da
unstable
HD Moore 2010-11-06 04:34:43 +00:00
parent f6415186a8
commit b3cc6e19b6
28 changed files with 1712 additions and 0 deletions

2
lib/anemone.rb Normal file

@ -0,0 +1,2 @@
require 'rubygems'
require 'anemone/core'

24
lib/anemone/cli.rb Normal file

@ -0,0 +1,24 @@
module Anemone
module CLI
COMMANDS = %w[count cron pagedepth serialize url-list]
def self.run
command = ARGV.shift
if COMMANDS.include? command
load "anemone/cli/#{command.tr('-', '_')}.rb"
else
puts <<-INFO
Anemone is a web spider framework that can collect
useful information about pages it visits.
Usage:
anemone <command> [arguments]
Commands:
#{COMMANDS.join(', ')}
INFO
end
end
end
end

22
lib/anemone/cli/count.rb Normal file

@ -0,0 +1,22 @@
require 'anemone'
begin
# make sure that the first option is a URL we can crawl
url = URI(ARGV[0])
rescue
puts <<-INFO
Usage:
anemone count <url>
Synopsis:
Crawls a site starting at the given URL and outputs the total number
of unique pages on the site.
INFO
exit(0)
end
Anemone.crawl(url) do |anemone|
anemone.after_crawl do |pages|
puts pages.uniq!.size
end
end

90
lib/anemone/cli/cron.rb Normal file

@ -0,0 +1,90 @@
require 'anemone'
require 'optparse'
require 'ostruct'
options = OpenStruct.new
options.relative = false
options.output_file = 'urls.txt'
begin
# make sure that the last argument is a URL we can crawl
root = URI(ARGV.last)
rescue
puts <<-INFO
Usage:
anemone cron [options] <url>
Synopsis:
Combination of `count`, `pagedepth` and `url-list` commands.
Performs pagedepth, url list, and count functionality.
Outputs results to STDOUT and link list to file (urls.txt).
Meant to be run daily as a cron job.
Options:
-r, --relative Output relative URLs (rather than absolute)
-o, --output filename Filename to save URL list to. Defaults to urls.txt.
INFO
exit(0)
end
# parse command-line options
opts = OptionParser.new
opts.on('-r', '--relative') { options.relative = true }
opts.on('-o', '--output filename') {|o| options.output_file = o }
opts.parse!(ARGV)
Anemone.crawl(root, {:discard_page_bodies => true}) do |anemone|
anemone.after_crawl do |pages|
puts "Crawl results for #{root}\n"
# print a list of 404's
not_found = []
pages.each_value do |page|
url = page.url.to_s
not_found << url if page.not_found?
end
unless not_found.empty?
puts "\n404's:"
missing_links = pages.urls_linking_to(not_found)
missing_links.each do |url, links|
if options.relative
puts URI(url).path.to_s
else
puts url
end
links.slice(0..10).each do |u|
u = u.path if options.relative
puts " linked from #{u}"
end
puts " ..." if links.size > 10
end
print "\n"
end
# remove redirect aliases, and calculate pagedepths
pages = pages.shortest_paths!(root).uniq
depths = pages.values.inject({}) do |depths, page|
depths[page.depth] ||= 0
depths[page.depth] += 1
depths
end
# print the page count
puts "Total pages: #{pages.size}\n"
# print a list of depths
depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
# output a list of urls to file
file = open(options.output_file, 'w')
pages.keys.each do |url|
url = options.relative ? URI(url).path.to_s : url.to_s
file.puts url
end
end
end

32
lib/anemone/cli/pagedepth.rb Normal file

@ -0,0 +1,32 @@
require 'anemone'
begin
# make sure that the first option is a URL we can crawl
root = URI(ARGV[0])
rescue
puts <<-INFO
Usage:
anemone pagedepth <url>
Synopsis:
Crawls a site starting at the given URL and outputs a count of
the number of pages at each depth of the crawl.
INFO
exit(0)
end
Anemone.crawl(root) do |anemone|
anemone.skip_links_like %r{^/c/$}, %r{^/stores/$}
anemone.after_crawl do |pages|
pages = pages.shortest_paths!(root).uniq!
depths = pages.values.inject({}) do |depths, page|
depths[page.depth] ||= 0
depths[page.depth] += 1
depths
end
depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
end
end

35
lib/anemone/cli/serialize.rb Normal file

@ -0,0 +1,35 @@
require 'anemone'
require 'optparse'
require 'ostruct'
begin
# make sure that the first option is a URL we can crawl
root = URI(ARGV[0])
rescue
puts <<-INFO
Usage:
anemone serialize [options] <url>
Synopsis:
Crawls a site starting at the given URL and saves the resulting
PageStore object to a file using Marshal serialization.
Options:
-o, --output filename Filename to save PageStore to. Defaults to crawl.{Time.now}
INFO
exit(0)
end
options = OpenStruct.new
options.output_file = "crawl.#{Time.now.to_i}"
# parse command-line options
opts = OptionParser.new
opts.on('-o', '--output filename') {|o| options.output_file = o }
opts.parse!(ARGV)
Anemone.crawl(root) do |anemone|
anemone.after_crawl do |pages|
open(options.output_file, 'w') {|f| Marshal.dump(pages, f)}
end
end

41
lib/anemone/cli/url_list.rb Normal file

@ -0,0 +1,41 @@
require 'anemone'
require 'optparse'
require 'ostruct'
options = OpenStruct.new
options.relative = false
begin
# make sure that the last option is a URL we can crawl
root = URI(ARGV.last)
rescue
puts <<-INFO
Usage:
anemone url-list [options] <url>
Synopsis:
Crawls a site starting at the given URL, and outputs the URL of each page
in the domain as they are encountered.
Options:
-r, --relative Output relative URLs (rather than absolute)
INFO
exit(0)
end
# parse command-line options
opts = OptionParser.new
opts.on('-r', '--relative') { options.relative = true }
opts.parse!(ARGV)
Anemone.crawl(root, :discard_page_bodies => true) do |anemone|
anemone.on_every_page do |page|
if options.relative
puts page.url.path
else
puts page.url
end
end
end

35
lib/anemone/cookie_store.rb Normal file

@ -0,0 +1,35 @@
require 'delegate'
require 'webrick/cookie'
class WEBrick::Cookie
def expired?
!!expires && expires < Time.now
end
end
module Anemone
class CookieStore < DelegateClass(Hash)
def initialize(cookies = nil)
@cookies = {}
cookies.each { |name, value| @cookies[name] = WEBrick::Cookie.new(name, value) } if cookies
super(@cookies)
end
def merge!(set_cookie_str)
begin
cookie_hash = WEBrick::Cookie.parse_set_cookies(set_cookie_str).inject({}) do |hash, cookie|
hash[cookie.name] = cookie if !!cookie
hash
end
@cookies.merge! cookie_hash
rescue
end
end
def to_s
@cookies.values.reject { |cookie| cookie.expired? }.map { |cookie| "#{cookie.name}=#{cookie.value}" }.join(';')
end
end
end
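
The store above keeps WEBrick::Cookie objects behind a Hash interface: seed cookies arrive as plain name/value pairs, server cookies as raw Set-Cookie header strings. A minimal standalone sketch, assuming this snapshot's lib directory is on the load path (cookie names and values are invented):

require 'anemone/cookie_store'

store = Anemone::CookieStore.new('session' => 'abc123')
store.merge! 'tracking=xyz789; path=/'   # parsed via WEBrick::Cookie.parse_set_cookies
puts store.to_s                          # => "session=abc123;tracking=xyz789"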

294
lib/anemone/core.rb Normal file

@ -0,0 +1,294 @@
require 'thread'
require 'robots'
require 'anemone/tentacle'
require 'anemone/page'
require 'anemone/exceptions'
require 'anemone/page_store'
require 'anemone/storage'
require 'anemone/storage/base'
module Anemone
VERSION = '0.5.0';
#
# Convenience method to start a crawl
#
def Anemone.crawl(urls, options = {}, &block)
Core.crawl(urls, options, &block)
end
class Core
# PageStore storing all Page objects encountered during the crawl
attr_reader :pages
# Hash of options for the crawl
attr_reader :opts
DEFAULT_OPTS = {
# run 4 Tentacle threads to fetch pages
:threads => 4,
# disable verbose output
:verbose => false,
# don't throw away the page response body after scanning it for links
:discard_page_bodies => false,
# identify self as Anemone/VERSION
:user_agent => "Anemone/#{Anemone::VERSION}",
# no delay between requests
:delay => 0,
# don't obey the robots exclusion protocol
:obey_robots_txt => false,
# by default, don't limit the depth of the crawl
:depth_limit => false,
# number of times HTTP redirects will be followed
:redirect_limit => 5,
# storage engine defaults to Hash in +process_options+ if none specified
:storage => nil,
# Hash of cookie name => value to send with HTTP requests
:cookies => nil,
# accept cookies from the server and send them back?
:accept_cookies => false,
# skip any link with a query string? e.g. http://foo.com/?u=user
:skip_query_strings => false
}
# Create setter methods for all options to be called from the crawl block
DEFAULT_OPTS.keys.each do |key|
define_method "#{key}=" do |value|
@opts[key.to_sym] = value
end
end
#
# Initialize the crawl with starting *urls* (single URL or Array of URLs)
# and optional *block*
#
def initialize(urls, opts = {})
@urls = [urls].flatten.map{ |url| url.is_a?(URI) ? url : URI(url) }
@urls.each{ |url| url.path = '/' if url.path.empty? }
@tentacles = []
@on_every_page_blocks = []
@on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
@skip_link_patterns = []
@after_crawl_blocks = []
@opts = opts
yield self if block_given?
end
#
# Convenience method to start a new crawl
#
def self.crawl(urls, opts = {})
self.new(urls, opts) do |core|
yield core if block_given?
core.run
end
end
#
# Add a block to be executed on the PageStore after the crawl
# is finished
#
def after_crawl(&block)
@after_crawl_blocks << block
self
end
#
# Add one or more Regex patterns for URLs which should not be
# followed
#
def skip_links_like(*patterns)
@skip_link_patterns.concat [patterns].flatten.compact
self
end
#
# Add a block to be executed on every Page as they are encountered
# during the crawl
#
def on_every_page(&block)
@on_every_page_blocks << block
self
end
#
# Add a block to be executed on Page objects with a URL matching
# one or more patterns
#
def on_pages_like(*patterns, &block)
if patterns
patterns.each do |pattern|
@on_pages_like_blocks[pattern] << block
end
end
self
end
#
# Specify a block which will select which links to follow on each page.
# The block should return an Array of URI objects.
#
def focus_crawl(&block)
@focus_crawl_block = block
self
end
#
# Perform the crawl
#
def run
process_options
@urls.delete_if { |url| !visit_link?(url) }
return if @urls.empty?
link_queue = Queue.new
page_queue = Queue.new
@opts[:threads].times do
@tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts).run }
end
@urls.each{ |url| link_queue.enq(url) }
loop do
page = page_queue.deq
@pages.touch_key page.url
puts "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose]
do_page_blocks page
page.discard_doc! if @opts[:discard_page_bodies]
links = links_to_follow page
links.each do |link|
link_queue << [link, page.url.dup, page.depth + 1]
end
@pages.touch_keys links
@pages[page.url] = page
# if we are done with the crawl, tell the threads to end
if link_queue.empty? and page_queue.empty?
until link_queue.num_waiting == @tentacles.size
Thread.pass
end
if page_queue.empty?
@tentacles.size.times { link_queue << :END }
break
end
end
end
@tentacles.each { |thread| thread.join }
do_after_crawl_blocks
self
end
private
def process_options
@opts = DEFAULT_OPTS.merge @opts
@opts[:threads] = 1 if @opts[:delay] > 0
storage = Anemone::Storage::Base.new(@opts[:storage] || Anemone::Storage.Hash)
@pages = PageStore.new(storage)
@robots = Robots.new(@opts[:user_agent]) if @opts[:obey_robots_txt]
freeze_options
end
#
# Freeze the opts Hash so that no options can be modified
# once the crawl begins
#
def freeze_options
@opts.freeze
@opts.each_key { |key| @opts[key].freeze }
@opts[:cookies].each_key { |key| @opts[:cookies][key].freeze } rescue nil
end
#
# Execute the after_crawl blocks
#
def do_after_crawl_blocks
@after_crawl_blocks.each { |block| block.call(@pages) }
end
#
# Execute the on_every_page blocks for *page*
#
def do_page_blocks(page)
@on_every_page_blocks.each do |block|
block.call(page)
end
@on_pages_like_blocks.each do |pattern, blocks|
blocks.each { |block| block.call(page) } if page.url.to_s =~ pattern
end
end
#
# Return an Array of links to follow from the given page.
# Based on whether or not the link has already been crawled,
# and the block given to focus_crawl()
#
def links_to_follow(page)
links = @focus_crawl_block ? @focus_crawl_block.call(page) : page.links
links.select { |link| visit_link?(link, page) }.map { |link| link.dup }
end
#
# Returns +true+ if *link* has not been visited already,
# and is not excluded by a skip_link pattern...
# and is not excluded by robots.txt...
# and is not deeper than the depth limit
# Returns +false+ otherwise.
#
def visit_link?(link, from_page = nil)
!@pages.has_page?(link) &&
!skip_link?(link) &&
!skip_query_string?(link) &&
allowed(link) &&
!too_deep?(from_page)
end
#
# Returns +true+ if we are obeying robots.txt and the link
# is granted access in it. Always returns +true+ when we are
# not obeying robots.txt.
#
def allowed(link)
@opts[:obey_robots_txt] ? @robots.allowed?(link) : true
end
#
# Returns +true+ if we are over the page depth limit.
# This only works when coming from a page and with the +depth_limit+ option set.
# When neither is the case, will always return +false+.
def too_deep?(from_page)
if from_page && @opts[:depth_limit]
from_page.depth >= @opts[:depth_limit]
else
false
end
end
#
# Returns +true+ if *link* should not be visited because
# it has a query string and +skip_query_strings+ is true.
#
def skip_query_string?(link)
@opts[:skip_query_strings] && link.query
end
#
# Returns +true+ if *link* should not be visited because
# its URL matches a skip_link pattern.
#
def skip_link?(link)
@skip_link_patterns.any? { |pattern| link.path =~ pattern }
end
end
end
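
Core generates a setter for every DEFAULT_OPTS key, so options and callbacks can all be configured inside the block handed to Anemone.crawl. A hedged sketch of that pattern; the URL and patterns are placeholders, and the robots and nokogiri gems the library depends on must be installed:

require 'anemone'

Anemone.crawl('http://www.example.com/') do |anemone|
  anemone.threads     = 2          # each setter writes the matching DEFAULT_OPTS key
  anemone.depth_limit = 3
  anemone.verbose     = true
  anemone.skip_links_like %r{/logout}, %r{\.pdf$}
  anemone.on_every_page { |page| puts "#{page.code} #{page.url}" }
  anemone.after_crawl  { |pages| puts "crawled #{pages.size} pages" }
end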

75
lib/anemone/docs/CHANGELOG.rdoc Normal file

@ -0,0 +1,75 @@
== 0.5.0 / 2010-09-01
* Major enhancements
* Added page storage engines for MongoDB and Redis
* Minor enhancements
* Use xpath for link parsing instead of CSS (faster) (Marc Seeger)
* Added skip_query_strings option to skip links with query strings (Joost Baaij)
* Bug fixes
* Only consider status code 300..307 a redirect (Marc Seeger)
* Canonicalize redirect links (Marc Seeger)
== 0.4.0 / 2010-04-08
* Major enhancements
* Cookies can be accepted and sent with each HTTP request.
== 0.3.2 / 2010-02-04
* Bug fixes
* Fixed issue that allowed following redirects off the original domain
== 0.3.1 / 2010-01-22
* Minor enhancements
* Added an attr_accessor to Page for the HTTP response body
* Bug fixes
* Fixed incorrect method calls in CLI scripts
== 0.3.0 / 2009-12-15
* Major enhancements
* Option for persistent storage of pages during crawl with TokyoCabinet or PStore
* Minor enhancements
* Options can be set via methods on the Core object in the crawl block
== 0.2.3 / 2009-11-01
* Minor enhancements
* Options are now applied per-crawl, rather than module-wide.
* Bug fixes
* Fixed a bug which caused deadlock if an exception occurred when crawling the last page in the queue.
== 0.2.2 / 2009-10-26
* Minor enhancements
* When the :verbose option is set to true, exception backtraces are printed to aid debugging.
== 0.2.1 / 2009-10-24
* Major enhancements
* Added HTTPS support.
* CLI program 'anemone', which is a frontend for several tasks.
* Minor enhancements
* HTTP request response time recorded in Page.
* Use of persistent HTTP connections.

4
lib/anemone/docs/CONTRIBUTORS.md Normal file

@ -0,0 +1,4 @@
Many thanks to the following folks who have contributed code to Anemone. In no particular order:
Marc Seeger
Joost Baaij

19
lib/anemone/docs/LICENSE.txt Normal file

@ -0,0 +1,19 @@
Copyright (c) 2009 Vertive, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.


@ -0,0 +1,3 @@
This snapshot of Anemone has been modified for use within the Metasploit Framework
* None yet, this is the initial import

36
lib/anemone/docs/README.rdoc Normal file

@ -0,0 +1,36 @@
= Anemone
Anemone is a web spider framework that can spider a domain and collect useful
information about the pages it visits. It is versatile, allowing you to
write your own specialized spider tasks quickly and easily.
See http://anemone.rubyforge.org for more information.
== Features
* Multi-threaded design for high performance
* Tracks 301 HTTP redirects
* Built-in BFS algorithm for determining page depth
* Allows exclusion of URLs based on regular expressions
* Choose the links to follow on each page with focus_crawl()
* HTTPS support
* Records response time for each page
* CLI program can list all pages in a domain, calculate page depths, and more
* Obey robots.txt
* In-memory or persistent storage of pages during crawl, using TokyoCabinet, MongoDB, or Redis
== Examples
See the scripts under the <tt>lib/anemone/cli</tt> directory for examples of several useful Anemone tasks.
== Requirements
* nokogiri
* robots
== Development
To test and develop this gem, additional requirements are:
* rspec
* fakeweb
* tokyocabinet
* mongo
* redis
You will need to have {Tokyo Cabinet}[http://fallabs.com/tokyocabinet/], {MongoDB}[http://www.mongodb.org/], and {Redis}[http://code.google.com/p/redis/] installed on your system and running.
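
The focus_crawl() hook mentioned above selects which of a page's links are followed, and on_pages_like() runs a callback only for matching URLs. A short illustrative sketch; the site and patterns are invented:

require 'anemone'

Anemone.crawl('http://store.example.com/') do |anemone|
  # follow at most five links per page; the block must return an Array of URIs
  anemone.focus_crawl { |page| page.links.first(5) }
  # react only to URLs matching the pattern
  anemone.on_pages_like(%r{/products/\d+}) { |page| puts page.url }
end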

26
lib/anemone/docs/Rakefile Normal file

@ -0,0 +1,26 @@
require 'rubygems'
require 'rake'
require 'spec/rake/spectask'
Spec::Rake::SpecTask.new(:spec) do |spec|
spec.libs << 'lib' << 'spec'
spec.spec_files = FileList['spec/**/*_spec.rb']
end
Spec::Rake::SpecTask.new(:rcov) do |spec|
spec.libs << 'lib' << 'spec'
spec.pattern = 'spec/**/*_spec.rb'
spec.rcov = true
end
task :default => :spec
require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
version = File.exist?('VERSION') ? File.read('VERSION') : ""
rdoc.rdoc_dir = 'rdoc'
rdoc.title = "anemone #{version}"
rdoc.rdoc_files.include('README*')
rdoc.rdoc_files.include('lib/**/*.rb')
end

1
lib/anemone/docs/VERSION Normal file

@ -0,0 +1 @@
0.5.0

5
lib/anemone/exceptions.rb Normal file

@ -0,0 +1,5 @@
module Anemone
class Error < ::StandardError
attr_accessor :wrapped_exception
end
end

157
lib/anemone/http.rb Normal file

@ -0,0 +1,157 @@
require 'net/https'
require 'anemone/page'
require 'anemone/cookie_store'
module Anemone
class HTTP
# Maximum number of redirects to follow on each get_response
REDIRECT_LIMIT = 5
# CookieStore for this HTTP client
attr_reader :cookie_store
def initialize(opts = {})
@connections = {}
@opts = opts
@cookie_store = CookieStore.new(@opts[:cookies])
end
#
# Fetch a single Page from the response of an HTTP request to *url*.
# Just gets the final destination page.
#
def fetch_page(url, referer = nil, depth = nil)
fetch_pages(url, referer, depth).last
end
#
# Create new Pages from the response of an HTTP request to *url*,
# including redirects
#
def fetch_pages(url, referer = nil, depth = nil)
begin
url = URI(url) unless url.is_a?(URI)
pages = []
get(url, referer) do |response, code, location, redirect_to, response_time|
pages << Page.new(location, :body => response.body.dup,
:code => code,
:headers => response.to_hash,
:referer => referer,
:depth => depth,
:redirect_to => redirect_to,
:response_time => response_time)
end
return pages
rescue => e
if verbose?
puts e.inspect
puts e.backtrace
end
return [Page.new(url, :error => e)]
end
end
#
# The maximum number of redirects to follow
#
def redirect_limit
@opts[:redirect_limit] || REDIRECT_LIMIT
end
#
# The user-agent string which will be sent with each request,
# or nil if no such option is set
#
def user_agent
@opts[:user_agent]
end
#
# Does this HTTP client accept cookies from the server?
#
def accept_cookies?
@opts[:accept_cookies]
end
private
#
# Retrieve HTTP responses for *url*, including redirects.
# Yields the response object, response code, and URI location
# for each response.
#
def get(url, referer = nil)
limit = redirect_limit
loc = url
begin
# if redirected to a relative url, merge it with the host of the original
# request url
loc = url.merge(loc) if loc.relative?
response, response_time = get_response(loc, referer)
code = Integer(response.code)
redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']).normalize : nil
yield response, code, loc, redirect_to, response_time
limit -= 1
end while (loc = redirect_to) && allowed?(redirect_to, url) && limit > 0
end
#
# Get an HTTPResponse for *url*, sending the appropriate User-Agent string
#
def get_response(url, referer = nil)
full_path = url.query.nil? ? url.path : "#{url.path}?#{url.query}"
opts = {}
opts['User-Agent'] = user_agent if user_agent
opts['Referer'] = referer.to_s if referer
opts['Cookie'] = @cookie_store.to_s unless @cookie_store.empty? || (!accept_cookies? && @opts[:cookies].nil?)
retries = 0
begin
start = Time.now()
response = connection(url).get(full_path, opts)
finish = Time.now()
response_time = ((finish - start) * 1000).round
@cookie_store.merge!(response['Set-Cookie']) if accept_cookies?
return response, response_time
rescue EOFError
refresh_connection(url)
retries += 1
retry unless retries > 3
end
end
def connection(url)
@connections[url.host] ||= {}
if conn = @connections[url.host][url.port]
return conn
end
refresh_connection url
end
def refresh_connection(url)
http = Net::HTTP.new(url.host, url.port)
if url.scheme == 'https'
http.use_ssl = true
http.verify_mode = OpenSSL::SSL::VERIFY_NONE
end
@connections[url.host][url.port] = http.start
end
def verbose?
@opts[:verbose]
end
#
# Allowed to connect to the requested url?
#
def allowed?(to_url, from_url)
to_url.host.nil? || (to_url.host == from_url.host)
end
end
end
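
The HTTP client above can also be driven on its own, outside a crawl. A minimal sketch with a placeholder URL:

require 'anemone/http'

http = Anemone::HTTP.new(:user_agent => 'Anemone/0.5.0')
page = http.fetch_page(URI('http://www.example.com/'))
puts "#{page.code} #{page.url} (#{page.response_time} ms)"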

202
lib/anemone/page.rb Normal file

@ -0,0 +1,202 @@
require 'nokogiri'
require 'ostruct'
require 'webrick/cookie'
module Anemone
class Page
# The URL of the page
attr_reader :url
# The raw HTTP response body of the page
attr_reader :body
# Headers of the HTTP response
attr_reader :headers
# URL of the page this one redirected to, if any
attr_reader :redirect_to
# Exception object, if one was raised during HTTP#fetch_page
attr_reader :error
# OpenStruct for user-stored data
attr_accessor :data
# Integer response code of the page
attr_accessor :code
# Boolean indicating whether or not this page has been visited in PageStore#shortest_paths!
attr_accessor :visited
# Depth of this page from the root of the crawl. This is not necessarily the
# shortest path; use PageStore#shortest_paths! to find that value.
attr_accessor :depth
# URL of the page that brought us to this page
attr_accessor :referer
# Response time of the request for this page in milliseconds
attr_accessor :response_time
#
# Create a new page
#
def initialize(url, params = {})
@url = url
@data = OpenStruct.new
@code = params[:code]
@headers = params[:headers] || {}
@headers['content-type'] ||= ['']
@aliases = Array(params[:aka]).compact
@referer = params[:referer]
@depth = params[:depth] || 0
@redirect_to = to_absolute(params[:redirect_to])
@response_time = params[:response_time]
@body = params[:body]
@error = params[:error]
@fetched = !params[:code].nil?
end
#
# Array of distinct A tag HREFs from the page
#
def links
return @links unless @links.nil?
@links = []
return @links if !doc
doc.search("//a[@href]").each do |a|
u = a['href']
next if u.nil? or u.empty?
abs = to_absolute(URI(u)) rescue next
@links << abs if in_domain?(abs)
end
@links.uniq!
@links
end
#
# Nokogiri document for the HTML body
#
def doc
return @doc if @doc
@doc = Nokogiri::HTML(@body) if @body && html? rescue nil
end
#
# Delete the Nokogiri document and response body to conserve memory
#
def discard_doc!
links # force parsing of page links before we trash the document
@doc = @body = nil
end
#
# Was the page successfully fetched?
# +true+ if the page was fetched with no error, +false+ otherwise.
#
def fetched?
@fetched
end
#
# Array of cookies received with this page as WEBrick::Cookie objects.
#
def cookies
WEBrick::Cookie.parse_set_cookies(@headers['Set-Cookie']) rescue []
end
#
# The content-type returned by the HTTP request for this page
#
def content_type
headers['content-type'].first
end
#
# Returns +true+ if the page is a HTML document, returns +false+
# otherwise.
#
def html?
!!(content_type =~ %r{^(text/html|application/xhtml\+xml)\b})
end
#
# Returns +true+ if the page is a HTTP redirect, returns +false+
# otherwise.
#
def redirect?
(300..307).include?(@code)
end
#
# Returns +true+ if the page was not found (returned 404 code),
# returns +false+ otherwise.
#
def not_found?
404 == @code
end
#
# Converts relative URL *link* into an absolute URL based on the
# location of the page
#
def to_absolute(link)
return nil if link.nil?
# remove anchor
link = URI.encode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,''))
relative = URI(link)
absolute = @url.merge(relative)
absolute.path = '/' if absolute.path.empty?
return absolute
end
#
# Returns +true+ if *uri* is in the same domain as the page, returns
# +false+ otherwise
#
def in_domain?(uri)
uri.host == @url.host
end
def marshal_dump
[@url, @headers, @data, @body, @links, @code, @visited, @depth, @referer, @redirect_to, @response_time, @fetched]
end
def marshal_load(ary)
@url, @headers, @data, @body, @links, @code, @visited, @depth, @referer, @redirect_to, @response_time, @fetched = ary
end
def to_hash
{'url' => @url.to_s,
'headers' => Marshal.dump(@headers),
'data' => Marshal.dump(@data),
'body' => @body,
'links' => links.map(&:to_s),
'code' => @code,
'visited' => @visited,
'depth' => @depth,
'referer' => @referer.to_s,
'redirect_to' => @redirect_to.to_s,
'response_time' => @response_time,
'fetched' => @fetched}
end
def self.from_hash(hash)
page = self.new(URI(hash['url']))
{'@headers' => Marshal.load(hash['headers']),
'@data' => Marshal.load(hash['data']),
'@body' => hash['body'],
'@links' => hash['links'].map { |link| URI(link) },
'@code' => hash['code'].to_i,
'@visited' => hash['visited'],
'@depth' => hash['depth'].to_i,
'@referer' => hash['referer'],
'@redirect_to' => URI(hash['redirect_to']),
'@response_time' => hash['response_time'].to_i,
'@fetched' => hash['fetched']
}.each do |var, value|
page.instance_variable_set(var, value)
end
page
end
end
end
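
Page#to_hash and Page.from_hash above are what the MongoDB and Redis storage adapters later in this import rely on. A round-trip sketch with an invented URL and body:

require 'uri'
require 'anemone/page'

page = Anemone::Page.new(URI('http://www.example.com/'),
                         :code    => 200,
                         :headers => { 'content-type' => ['text/html'] },
                         :body    => '<html><a href="/about">About</a></html>')
copy = Anemone::Page.from_hash(page.to_hash)
puts copy.links.map(&:to_s).inspect   # => ["http://www.example.com/about"]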

160
lib/anemone/page_store.rb Normal file

@ -0,0 +1,160 @@
require 'forwardable'
module Anemone
class PageStore
extend Forwardable
def_delegators :@storage, :keys, :values, :size, :each
def initialize(storage = {})
@storage = storage
end
# We typically index the hash with a URI,
# but convert it to a String for easier retrieval
def [](index)
@storage[index.to_s]
end
def []=(index, other)
@storage[index.to_s] = other
end
def delete(key)
@storage.delete key.to_s
end
def has_key?(key)
@storage.has_key? key.to_s
end
def each_value
each { |key, value| yield value }
end
def values
result = []
each { |key, value| result << value }
result
end
def touch_key(key)
self[key] = Page.new(key)
end
def touch_keys(keys)
@storage.merge! keys.inject({}) { |h, k| h[k.to_s] = Page.new(k); h }
end
# Does this PageStore contain the specified URL?
# HTTP and HTTPS versions of a URL are considered to be the same page.
def has_page?(url)
schemes = %w(http https)
if schemes.include? url.scheme
u = url.dup
return schemes.any? { |s| u.scheme = s; has_key?(u) }
end
has_key? url
end
#
# Use a breadth-first search to calculate the single-source
# shortest paths from *root* to all pages in the PageStore
#
def shortest_paths!(root)
root = URI(root) if root.is_a?(String)
raise "Root node not found" if !has_key?(root)
q = Queue.new
q.enq root
root_page = self[root]
root_page.depth = 0
root_page.visited = true
self[root] = root_page
while !q.empty?
page = self[q.deq]
page.links.each do |u|
begin
link = self[u]
next if link.nil? || !link.fetched? || link.visited
q << u unless link.redirect?
link.visited = true
link.depth = page.depth + 1
self[u] = link
if link.redirect?
u = link.redirect_to
redo
end
end
end
end
self
end
#
# Removes all Pages from storage where redirect? is true
#
def uniq!
each_value { |page| delete page.url if page.redirect? }
self
end
#
# If given a single URL (as a String or URI), returns an Array of Pages which link to that URL
# If given an Array of URLs, returns a Hash (URI => [Page, Page...]) of Pages linking to those URLs
#
def pages_linking_to(urls)
unless urls.is_a?(Array)
urls = [urls]
single = true
end
urls.map! do |url|
unless url.is_a?(URI)
URI(url) rescue nil
else
url
end
end
urls.compact!
links = {}
urls.each { |url| links[url] = [] }
values.each do |page|
urls.each { |url| links[url] << page if page.links.include?(url) }
end
if single and !links.empty?
return links[urls.first]
else
return links
end
end
#
# If given a single URL (as a String or URI), returns an Array of URLs which link to that URL
# If given an Array of URLs, returns a Hash (URI => [URI, URI...]) of URLs linking to those URLs
#
def urls_linking_to(urls)
unless urls.is_a?(Array)
urls = [urls]
single = true
end
links = pages_linking_to(urls)
links.each { |url, pages| links[url] = pages.map{|p| p.url} }
if single and !links.empty?
return links[urls.first]
else
return links
end
end
end
end
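
PageStore#has_page? treats the HTTP and HTTPS forms of a URL as the same page. A quick sketch against the default in-memory store; the URL is invented:

require 'uri'
require 'anemone/page'
require 'anemone/page_store'

store = Anemone::PageStore.new
store.touch_key URI('http://www.example.com/')
puts store.has_page?(URI('https://www.example.com/'))   # => true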

34
lib/anemone/storage.rb Normal file

@ -0,0 +1,34 @@
module Anemone
module Storage
def self.Hash(*args)
hash = Hash.new(*args)
# add close method for compatibility with Storage::Base
class << hash; def close; end; end
hash
end
def self.PStore(*args)
require 'anemone/storage/pstore'
self::PStore.new(*args)
end
def self.TokyoCabinet(file = 'anemone.tch')
require 'anemone/storage/tokyo_cabinet'
self::TokyoCabinet.new(file)
end
def self.MongoDB(mongo_db = nil, collection_name = 'pages')
require 'anemone/storage/mongodb'
mongo_db ||= Mongo::Connection.new.db('anemone')
raise "First argument must be an instance of Mongo::DB" unless mongo_db.is_a?(Mongo::DB)
self::MongoDB.new(mongo_db, collection_name)
end
def self.Redis(opts = {})
require 'anemone/storage/redis'
self::Redis.new(opts)
end
end
end
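
These factory methods feed the :storage option consumed by Core#process_options. A hedged sketch using the PStore backend; the URL and filename are placeholders, and the MongoDB, Redis, and TokyoCabinet factories plug in the same way:

require 'anemone'

Anemone.crawl('http://www.example.com/') do |anemone|
  anemone.storage = Anemone::Storage.PStore('crawl.pstore')
  anemone.after_crawl { |pages| puts "stored #{pages.size} pages" }
end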

75
lib/anemone/storage/base.rb Normal file

@ -0,0 +1,75 @@
require 'anemone/storage/exceptions'
module Anemone
module Storage
class Base
def initialize(adapter)
@adap = adapter
# verify adapter conforms to this class's methods
methods.each do |method|
if !@adap.respond_to?(method.to_sym)
raise "Storage adapter must support method #{method}"
end
end
end
def [](key)
@adap[key]
rescue
puts key
raise RetrievalError, $!
end
def []=(key, value)
@adap[key] = value
rescue
raise InsertionError, $!
end
def delete(key)
@adap.delete(key)
rescue
raise DeletionError, $!
end
def each
@adap.each { |k, v| yield k, v }
rescue
raise GenericError, $!
end
def merge!(hash)
@adap.merge!(hash)
rescue
raise GenericError, $!
end
def close
@adap.close
rescue
raise CloseError, $!
end
def size
@adap.size
rescue
raise GenericError, $!
end
def keys
@adap.keys
rescue
raise GenericError, $!
end
def has_key?(key)
@adap.has_key?(key)
rescue
raise GenericError, $!
end
end
end
end
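
Storage::Base is a thin guard around an adapter: it verifies the adapter responds to every method it will delegate to, and converts adapter failures into the exception classes defined in the next file. A small sketch wrapping the default Hash adapter:

require 'anemone/exceptions'     # Anemone::Error, parent of the storage errors
require 'anemone/storage'
require 'anemone/storage/base'

store = Anemone::Storage::Base.new(Anemone::Storage.Hash)
store['http://www.example.com/'] = :placeholder    # Core stores Page objects here
puts store.size                                     # => 1
puts store.has_key?('http://www.example.com/')      # => true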

15
lib/anemone/storage/exceptions.rb Normal file

@ -0,0 +1,15 @@
module Anemone
module Storage
class GenericError < Error; end;
class ConnectionError < Error; end
class RetrievalError < Error; end
class InsertionError < Error; end
class DeletionError < Error; end
class CloseError < Error; end
end
end

89
lib/anemone/storage/mongodb.rb Normal file

@ -0,0 +1,89 @@
begin
require 'mongo'
rescue LoadError
puts "You need the mongo gem to use Anemone::Storage::MongoDB"
exit
end
module Anemone
module Storage
class MongoDB
BINARY_FIELDS = %w(body headers data)
def initialize(mongo_db, collection_name)
@db = mongo_db
@collection = @db[collection_name]
@collection.remove
@collection.create_index 'url'
end
def [](url)
if value = @collection.find_one('url' => url.to_s)
load_page(value)
end
end
def []=(url, page)
hash = page.to_hash
BINARY_FIELDS.each do |field|
hash[field] = BSON::Binary.new(hash[field]) unless hash[field].nil?
end
@collection.update(
{'url' => page.url.to_s},
hash,
:upsert => true
)
end
def delete(url)
page = self[url]
@collection.remove('url' => url.to_s)
page
end
def each
@collection.find do |cursor|
cursor.each do |doc|
page = load_page(doc)
yield page.url.to_s, page
end
end
end
def merge!(hash)
hash.each { |key, value| self[key] = value }
self
end
def size
@collection.count
end
def keys
keys = []
self.each { |k, v| keys << k.to_s }
keys
end
def has_key?(url)
!!@collection.find_one('url' => url.to_s)
end
def close
@db.connection.close
end
private
def load_page(hash)
BINARY_FIELDS.each do |field|
hash[field] = hash[field].to_s
end
Page.from_hash(hash)
end
end
end
end

50
lib/anemone/storage/pstore.rb Normal file

@ -0,0 +1,50 @@
require 'pstore'
require 'forwardable'
module Anemone
module Storage
class PStore
extend Forwardable
def_delegators :@keys, :has_key?, :keys, :size
def initialize(file)
File.delete(file) if File.exists?(file)
@store = ::PStore.new(file)
@keys = {}
end
def [](key)
@store.transaction { |s| s[key] }
end
def []=(key,value)
@keys[key] = nil
@store.transaction { |s| s[key] = value }
end
def delete(key)
@keys.delete(key)
@store.transaction { |s| s.delete key}
end
def each
@keys.each_key do |key|
value = nil
@store.transaction { |s| value = s[key] }
yield key, value
end
end
def merge!(hash)
@store.transaction do |s|
hash.each { |key, value| s[key] = value; @keys[key] = nil }
end
self
end
def close; end
end
end
end

90
lib/anemone/storage/redis.rb Normal file

@ -0,0 +1,90 @@
require 'redis'
module Anemone
module Storage
class Redis
MARSHAL_FIELDS = %w(links visited fetched)
def initialize(opts = {})
@redis = ::Redis.new(opts)
@key_prefix = opts[:key_prefix] || 'anemone'
keys.each { |key| delete(key) }
end
def [](key)
rkey = "#{@key_prefix}:pages:#{key.to_s}"
rget(rkey)
end
def []=(key, value)
rkey = "#{@key_prefix}:pages:#{key.to_s}"
hash = value.to_hash
MARSHAL_FIELDS.each do |field|
hash[field] = Marshal.dump(hash[field])
end
hash.each do |field, value|
@redis.hset(rkey, field, value)
end
end
def delete(key)
rkey = "#{@key_prefix}:pages:#{key.to_s}"
page = self[key]
@redis.del(rkey)
page
end
def each
rkeys = @redis.keys("#{@key_prefix}:pages:*")
rkeys.each do |rkey|
page = rget(rkey)
yield page.url.to_s, page
end
end
def merge!(hash)
hash.each { |key, value| self[key] = value }
self
end
def size
@redis.keys("#{@key_prefix}:pages:*").size
end
def keys
keys = []
self.each { |k, v| keys << k.to_s }
keys
end
def has_key?(key)
rkey = "#{@key_prefix}:pages:#{key.to_s}"
@redis.exists(rkey)
end
def close
@redis.quit
end
private
def load_value(hash)
MARSHAL_FIELDS.each do |field|
unless hash[field].nil? || hash[field] == ''
hash[field] = Marshal.load(hash[field])
end
end
Page.from_hash(hash)
end
def rget(rkey)
hash = @redis.hgetall(rkey)
if !!hash
load_value(hash)
end
end
end
end
end

57
lib/anemone/storage/tokyo_cabinet.rb Normal file

@ -0,0 +1,57 @@
begin
require 'tokyocabinet'
rescue LoadError
puts "You need the tokyocabinet gem to use Anemone::Storage::TokyoCabinet"
exit
end
require 'forwardable'
module Anemone
module Storage
class TokyoCabinet
extend Forwardable
def_delegators :@db, :close, :size, :keys, :has_key?
def initialize(file)
raise "TokyoCabinet filename must have .tch extension" if File.extname(file) != '.tch'
@db = ::TokyoCabinet::HDB::new
@db.open(file, ::TokyoCabinet::HDB::OWRITER | ::TokyoCabinet::HDB::OCREAT)
@db.clear
end
def [](key)
if value = @db[key]
load_value(value)
end
end
def []=(key, value)
@db[key] = [Marshal.dump(value)].pack("m")
end
def delete(key)
value = self[key]
@db.delete(key)
value
end
def each
@db.each { |k, v| yield k, load_value(v) }
end
def merge!(hash)
hash.each { |key, value| self[key] = value }
self
end
private
def load_value(value)
Marshal.load(value.unpack("m")[0])
end
end
end
end

39
lib/anemone/tentacle.rb Normal file

@ -0,0 +1,39 @@
require 'anemone/http'
module Anemone
class Tentacle
#
# Create a new Tentacle
#
def initialize(link_queue, page_queue, opts = {})
@link_queue = link_queue
@page_queue = page_queue
@http = Anemone::HTTP.new(opts)
@opts = opts
end
#
# Gets links from @link_queue, and returns the fetched
# Page objects into @page_queue
#
def run
loop do
link, referer, depth = @link_queue.deq
break if link == :END
@http.fetch_pages(link, referer, depth).each { |page| @page_queue << page }
delay
end
end
private
def delay
sleep @opts[:delay] if @opts[:delay] > 0
end
end
end