Initial import of an Anemone snapshot
git-svn-id: file:///home/svn/framework3/trunk@10924 4d416f70-5f16-0410-b530-b9f4589650da

branch: unstable
parent: f6415186a8
commit: b3cc6e19b6
@@ -0,0 +1,2 @@
require 'rubygems'
require 'anemone/core'
@@ -0,0 +1,24 @@
module Anemone
  module CLI
    COMMANDS = %w[count cron pagedepth serialize url-list]

    def self.run
      command = ARGV.shift

      if COMMANDS.include? command
        load "anemone/cli/#{command.tr('-', '_')}.rb"
      else
        puts <<-INFO
Anemone is a web spider framework that can collect
useful information about pages it visits.

Usage:
  anemone <command> [arguments]

Commands:
  #{COMMANDS.join(', ')}
        INFO
      end
    end
  end
end
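As a rough sketch of what the dispatcher above does (assuming the gem's lib directory is on the load path, and using `url-list` purely as an illustration), a hyphenated command name is translated to an underscore file name and that script is loaded:

  command = 'url-list'
  command.tr('-', '_')                            # => "url_list"
  load "anemone/cli/#{command.tr('-', '_')}.rb"   # loads the url_list script with the remaining ARGV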
@@ -0,0 +1,22 @@
require 'anemone'

begin
  # make sure that the first option is a URL we can crawl
  url = URI(ARGV[0])
rescue
  puts <<-INFO
Usage:
  anemone count <url>

Synopsis:
  Crawls a site starting at the given URL and outputs the total number
  of unique pages on the site.
INFO
  exit(0)
end

Anemone.crawl(url) do |anemone|
  anemone.after_crawl do |pages|
    puts pages.uniq!.size
  end
end
@@ -0,0 +1,90 @@
require 'anemone'
require 'optparse'
require 'ostruct'

options = OpenStruct.new
options.relative = false
options.output_file = 'urls.txt'

begin
  # make sure that the last argument is a URL we can crawl
  root = URI(ARGV.last)
rescue
  puts <<-INFO
Usage:
  anemone cron [options] <url>

Synopsis:
  Combination of `count`, `pagedepth` and `url-list` commands.
  Performs pagedepth, url list, and count functionality.
  Outputs results to STDOUT and link list to file (urls.txt).
  Meant to be run daily as a cron job.

Options:
  -r, --relative           Output relative URLs (rather than absolute)
  -o, --output filename    Filename to save URL list to. Defaults to urls.txt.
INFO
  exit(0)
end

# parse command-line options
opts = OptionParser.new
opts.on('-r', '--relative') { options.relative = true }
opts.on('-o', '--output filename') {|o| options.output_file = o }
opts.parse!(ARGV)

Anemone.crawl(root, {:discard_page_bodies => true}) do |anemone|

  anemone.after_crawl do |pages|
    puts "Crawl results for #{root}\n"

    # print a list of 404's
    not_found = []
    pages.each_value do |page|
      url = page.url.to_s
      not_found << url if page.not_found?
    end
    unless not_found.empty?
      puts "\n404's:"

      missing_links = pages.urls_linking_to(not_found)
      missing_links.each do |url, links|
        if options.relative
          puts URI(url).path.to_s
        else
          puts url
        end
        links.slice(0..10).each do |u|
          u = u.path if options.relative
          puts " linked from #{u}"
        end

        puts " ..." if links.size > 10
      end

      print "\n"
    end

    # remove redirect aliases, and calculate pagedepths
    pages = pages.shortest_paths!(root).uniq
    depths = pages.values.inject({}) do |depths, page|
      depths[page.depth] ||= 0
      depths[page.depth] += 1
      depths
    end

    # print the page count
    puts "Total pages: #{pages.size}\n"

    # print a list of depths
    depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }

    # output a list of urls to file
    file = open(options.output_file, 'w')
    pages.each_key do |url|
      url = options.relative ? url.path.to_s : url.to_s
      file.puts url
    end
  end

end
@@ -0,0 +1,32 @@
require 'anemone'

begin
  # make sure that the first option is a URL we can crawl
  root = URI(ARGV[0])
rescue
  puts <<-INFO
Usage:
  anemone pagedepth <url>

Synopsis:
  Crawls a site starting at the given URL and outputs a count of
  the number of pages at each depth of the crawl.
INFO
  exit(0)
end

Anemone.crawl(root) do |anemone|
  anemone.skip_links_like %r{^/c/$}, %r{^/stores/$}

  anemone.after_crawl do |pages|
    pages = pages.shortest_paths!(root).uniq!

    depths = pages.values.inject({}) do |depths, page|
      depths[page.depth] ||= 0
      depths[page.depth] += 1
      depths
    end

    depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
  end
end

@@ -0,0 +1,35 @@
require 'anemone'
require 'optparse'
require 'ostruct'

begin
  # make sure that the first option is a URL we can crawl
  root = URI(ARGV[0])
rescue
  puts <<-INFO
Usage:
  anemone serialize [options] <url>

Synopsis:
  Crawls a site starting at the given URL and saves the resulting
  PageStore object to a file using Marshal serialization.

Options:
  -o, --output filename    Filename to save PageStore to. Defaults to crawl.{Time.now}
INFO
  exit(0)
end

options = OpenStruct.new
options.output_file = "crawl.#{Time.now.to_i}"

# parse command-line options
opts = OptionParser.new
opts.on('-o', '--output filename') {|o| options.output_file = o }
opts.parse!(ARGV)

Anemone.crawl(root) do |anemone|
  anemone.after_crawl do |pages|
    open(options.output_file, 'w') {|f| Marshal.dump(pages, f)}
  end
end
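The file written by the serialize command can later be read back with Marshal. A minimal sketch, where the filename is whatever was passed to --output (the timestamped name below is only an example):

  pages = open('crawl.1283299200', 'rb') { |f| Marshal.load(f) }
  puts pages.size   # the restored PageStore answers the usual queries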
@@ -0,0 +1,41 @@
require 'anemone'
require 'optparse'
require 'ostruct'

options = OpenStruct.new
options.relative = false

begin
  # make sure that the last option is a URL we can crawl
  root = URI(ARGV.last)
rescue
  puts <<-INFO
Usage:
  anemone url-list [options] <url>

Synopsis:
  Crawls a site starting at the given URL, and outputs the URL of each page
  in the domain as they are encountered.

Options:
  -r, --relative    Output relative URLs (rather than absolute)
INFO
  exit(0)
end

# parse command-line options
opts = OptionParser.new
opts.on('-r', '--relative') { options.relative = true }
opts.parse!(ARGV)

Anemone.crawl(root, :discard_page_bodies => true) do |anemone|

  anemone.on_every_page do |page|
    if options.relative
      puts page.url.path
    else
      puts page.url
    end
  end

end

@@ -0,0 +1,35 @@
require 'delegate'
require 'webrick/cookie'

class WEBrick::Cookie
  def expired?
    !!expires && expires < Time.now
  end
end

module Anemone
  class CookieStore < DelegateClass(Hash)

    def initialize(cookies = nil)
      @cookies = {}
      cookies.each { |name, value| @cookies[name] = WEBrick::Cookie.new(name, value) } if cookies
      super(@cookies)
    end

    def merge!(set_cookie_str)
      begin
        cookie_hash = WEBrick::Cookie.parse_set_cookies(set_cookie_str).inject({}) do |hash, cookie|
          hash[cookie.name] = cookie if !!cookie
          hash
        end
        @cookies.merge! cookie_hash
      rescue
      end
    end

    def to_s
      @cookies.values.reject { |cookie| cookie.expired? }.map { |cookie| "#{cookie.name}=#{cookie.value}" }.join(';')
    end

  end
end
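A rough sketch of how the store above is used by the HTTP client: seed cookies come from the :cookies option, a Set-Cookie header string is folded in with merge!, and to_s produces the value for an outgoing Cookie header (the names and values here are made up for illustration):

  store = Anemone::CookieStore.new('session' => 'abc123')
  store.merge! 'tracking=42; path=/; expires=Thu, 01 Jan 2037 00:00:00 GMT'
  store.to_s   # => e.g. "session=abc123;tracking=42"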
@@ -0,0 +1,294 @@
require 'thread'
require 'robots'
require 'anemone/tentacle'
require 'anemone/page'
require 'anemone/exceptions'
require 'anemone/page_store'
require 'anemone/storage'
require 'anemone/storage/base'

module Anemone

  VERSION = '0.5.0';

  #
  # Convenience method to start a crawl
  #
  def Anemone.crawl(urls, options = {}, &block)
    Core.crawl(urls, options, &block)
  end

  class Core

    # PageStore storing all Page objects encountered during the crawl
    attr_reader :pages
    # Hash of options for the crawl
    attr_reader :opts

    DEFAULT_OPTS = {
      # run 4 Tentacle threads to fetch pages
      :threads => 4,
      # disable verbose output
      :verbose => false,
      # don't throw away the page response body after scanning it for links
      :discard_page_bodies => false,
      # identify self as Anemone/VERSION
      :user_agent => "Anemone/#{Anemone::VERSION}",
      # no delay between requests
      :delay => 0,
      # don't obey the robots exclusion protocol
      :obey_robots_txt => false,
      # by default, don't limit the depth of the crawl
      :depth_limit => false,
      # number of times HTTP redirects will be followed
      :redirect_limit => 5,
      # storage engine defaults to Hash in +process_options+ if none specified
      :storage => nil,
      # Hash of cookie name => value to send with HTTP requests
      :cookies => nil,
      # accept cookies from the server and send them back?
      :accept_cookies => false,
      # skip any link with a query string? e.g. http://foo.com/?u=user
      :skip_query_strings => false
    }

    # Create setter methods for all options to be called from the crawl block
    DEFAULT_OPTS.keys.each do |key|
      define_method "#{key}=" do |value|
        @opts[key.to_sym] = value
      end
    end

    #
    # Initialize the crawl with starting *urls* (single URL or Array of URLs)
    # and optional *block*
    #
    def initialize(urls, opts = {})
      @urls = [urls].flatten.map{ |url| url.is_a?(URI) ? url : URI(url) }
      @urls.each{ |url| url.path = '/' if url.path.empty? }

      @tentacles = []
      @on_every_page_blocks = []
      @on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
      @skip_link_patterns = []
      @after_crawl_blocks = []
      @opts = opts

      yield self if block_given?
    end

    #
    # Convenience method to start a new crawl
    #
    def self.crawl(urls, opts = {})
      self.new(urls, opts) do |core|
        yield core if block_given?
        core.run
      end
    end

    #
    # Add a block to be executed on the PageStore after the crawl
    # is finished
    #
    def after_crawl(&block)
      @after_crawl_blocks << block
      self
    end

    #
    # Add one or more Regex patterns for URLs which should not be
    # followed
    #
    def skip_links_like(*patterns)
      @skip_link_patterns.concat [patterns].flatten.compact
      self
    end

    #
    # Add a block to be executed on every Page as they are encountered
    # during the crawl
    #
    def on_every_page(&block)
      @on_every_page_blocks << block
      self
    end

    #
    # Add a block to be executed on Page objects with a URL matching
    # one or more patterns
    #
    def on_pages_like(*patterns, &block)
      if patterns
        patterns.each do |pattern|
          @on_pages_like_blocks[pattern] << block
        end
      end
      self
    end

    #
    # Specify a block which will select which links to follow on each page.
    # The block should return an Array of URI objects.
    #
    def focus_crawl(&block)
      @focus_crawl_block = block
      self
    end

    #
    # Perform the crawl
    #
    def run
      process_options

      @urls.delete_if { |url| !visit_link?(url) }
      return if @urls.empty?

      link_queue = Queue.new
      page_queue = Queue.new

      @opts[:threads].times do
        @tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts).run }
      end

      @urls.each{ |url| link_queue.enq(url) }

      loop do
        page = page_queue.deq
        @pages.touch_key page.url
        puts "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose]
        do_page_blocks page
        page.discard_doc! if @opts[:discard_page_bodies]

        links = links_to_follow page
        links.each do |link|
          link_queue << [link, page.url.dup, page.depth + 1]
        end
        @pages.touch_keys links

        @pages[page.url] = page

        # if we are done with the crawl, tell the threads to end
        if link_queue.empty? and page_queue.empty?
          until link_queue.num_waiting == @tentacles.size
            Thread.pass
          end
          if page_queue.empty?
            @tentacles.size.times { link_queue << :END }
            break
          end
        end
      end

      @tentacles.each { |thread| thread.join }
      do_after_crawl_blocks
      self
    end

    private

    def process_options
      @opts = DEFAULT_OPTS.merge @opts
      @opts[:threads] = 1 if @opts[:delay] > 0
      storage = Anemone::Storage::Base.new(@opts[:storage] || Anemone::Storage.Hash)
      @pages = PageStore.new(storage)
      @robots = Robots.new(@opts[:user_agent]) if @opts[:obey_robots_txt]

      freeze_options
    end

    #
    # Freeze the opts Hash so that no options can be modified
    # once the crawl begins
    #
    def freeze_options
      @opts.freeze
      @opts.each_key { |key| @opts[key].freeze }
      @opts[:cookies].each_key { |key| @opts[:cookies][key].freeze } rescue nil
    end

    #
    # Execute the after_crawl blocks
    #
    def do_after_crawl_blocks
      @after_crawl_blocks.each { |block| block.call(@pages) }
    end

    #
    # Execute the on_every_page blocks for *page*
    #
    def do_page_blocks(page)
      @on_every_page_blocks.each do |block|
        block.call(page)
      end

      @on_pages_like_blocks.each do |pattern, blocks|
        blocks.each { |block| block.call(page) } if page.url.to_s =~ pattern
      end
    end

    #
    # Return an Array of links to follow from the given page.
    # Based on whether or not the link has already been crawled,
    # and the block given to focus_crawl()
    #
    def links_to_follow(page)
      links = @focus_crawl_block ? @focus_crawl_block.call(page) : page.links
      links.select { |link| visit_link?(link, page) }.map { |link| link.dup }
    end

    #
    # Returns +true+ if *link* has not been visited already,
    # and is not excluded by a skip_link pattern...
    # and is not excluded by robots.txt...
    # and is not deeper than the depth limit
    # Returns +false+ otherwise.
    #
    def visit_link?(link, from_page = nil)
      !@pages.has_page?(link) &&
        !skip_link?(link) &&
        !skip_query_string?(link) &&
        allowed(link) &&
        !too_deep?(from_page)
    end

    #
    # Returns +true+ if we are obeying robots.txt and the link
    # is granted access in it. Always returns +true+ when we are
    # not obeying robots.txt.
    #
    def allowed(link)
      @opts[:obey_robots_txt] ? @robots.allowed?(link) : true
    end

    #
    # Returns +true+ if we are over the page depth limit.
    # This only works when coming from a page and with the +depth_limit+ option set.
    # When neither is the case, will always return +false+.
    def too_deep?(from_page)
      if from_page && @opts[:depth_limit]
        from_page.depth >= @opts[:depth_limit]
      else
        false
      end
    end

    #
    # Returns +true+ if *link* should not be visited because
    # it has a query string and +skip_query_strings+ is true.
    #
    def skip_query_string?(link)
      @opts[:skip_query_strings] && link.query
    end

    #
    # Returns +true+ if *link* should not be visited because
    # its URL matches a skip_link pattern.
    #
    def skip_link?(link)
      @skip_link_patterns.any? { |pattern| link.path =~ pattern }
    end

  end
end
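Because Core defines a setter for every key in DEFAULT_OPTS, options can be supplied either in the options Hash or from inside the crawl block. A hedged sketch of the resulting DSL (the URL, values, and patterns below are illustrative only):

  Anemone.crawl('http://example.com/', :verbose => true) do |anemone|
    anemone.threads     = 2       # same keys as DEFAULT_OPTS
    anemone.depth_limit = 3
    anemone.skip_links_like %r{/login}
    anemone.on_pages_like(%r{/docs/}) { |page| puts page.url }
  end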
@@ -0,0 +1,75 @@
== 0.5.0 / 2010-09-01

* Major enhancements

  * Added page storage engines for MongoDB and Redis

* Minor enhancements

  * Use xpath for link parsing instead of CSS (faster) (Marc Seeger)
  * Added skip_query_strings option to skip links with query strings (Joost Baaij)

* Bug fixes

  * Only consider status code 300..307 a redirect (Marc Seeger)
  * Canonicalize redirect links (Marc Seeger)

== 0.4.0 / 2010-04-08

* Major enhancements

  * Cookies can be accepted and sent with each HTTP request.

== 0.3.2 / 2010-02-04

* Bug fixes

  * Fixed issue that allowed following redirects off the original domain

== 0.3.1 / 2010-01-22

* Minor enhancements

  * Added an attr_accessor to Page for the HTTP response body

* Bug fixes

  * Fixed incorrect method calls in CLI scripts

== 0.3.0 / 2009-12-15

* Major enhancements

  * Option for persistent storage of pages during crawl with TokyoCabinet or PStore

* Minor enhancements

  * Options can be set via methods on the Core object in the crawl block

== 0.2.3 / 2009-11-01

* Minor enhancements

  * Options are now applied per-crawl, rather than module-wide.

* Bug fixes

  * Fixed a bug which caused deadlock if an exception occurred when crawling the last page in the queue.

== 0.2.2 / 2009-10-26

* Minor enhancements

  * When the :verbose option is set to true, exception backtraces are printed to aid debugging.

== 0.2.1 / 2009-10-24

* Major enhancements

  * Added HTTPS support.
  * CLI program 'anemone', which is a frontend for several tasks.

* Minor enhancements

  * HTTP request response time recorded in Page.
  * Use of persistent HTTP connections.
@@ -0,0 +1,4 @@
Many thanks to the following folks who have contributed code to Anemone. In no particular order:

Marc Seeger
Joost Baaij

@@ -0,0 +1,19 @@
Copyright (c) 2009 Vertive, Inc.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

@@ -0,0 +1,3 @@
This snapshot of Anemone has been modified for use within the Metasploit Framework

* None yet, this is the initial import
@@ -0,0 +1,36 @@
= Anemone

Anemone is a web spider framework that can spider a domain and collect useful
information about the pages it visits. It is versatile, allowing you to
write your own specialized spider tasks quickly and easily.

See http://anemone.rubyforge.org for more information.

== Features
* Multi-threaded design for high performance
* Tracks 301 HTTP redirects
* Built-in BFS algorithm for determining page depth
* Allows exclusion of URLs based on regular expressions
* Choose the links to follow on each page with focus_crawl()
* HTTPS support
* Records response time for each page
* CLI program can list all pages in a domain, calculate page depths, and more
* Obey robots.txt
* In-memory or persistent storage of pages during crawl, using TokyoCabinet, MongoDB, or Redis

== Examples
See the scripts under the <tt>lib/anemone/cli</tt> directory for examples of several useful Anemone tasks.
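
As a minimal illustration (a sketch, not taken from the original README), a crawl that prints every URL it visits looks roughly like:

  require 'anemone'

  Anemone.crawl("http://www.example.com/") do |anemone|
    anemone.on_every_page do |page|
      puts page.url
    end
  end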

== Requirements
* nokogiri
* robots

== Development
To test and develop this gem, additional requirements are:
* rspec
* fakeweb
* tokyocabinet
* mongo
* redis

You will need to have {Tokyo Cabinet}[http://fallabs.com/tokyocabinet/], {MongoDB}[http://www.mongodb.org/], and {Redis}[http://code.google.com/p/redis/] installed on your system and running.
@@ -0,0 +1,26 @@
require 'rubygems'
require 'rake'

require 'spec/rake/spectask'
Spec::Rake::SpecTask.new(:spec) do |spec|
  spec.libs << 'lib' << 'spec'
  spec.spec_files = FileList['spec/**/*_spec.rb']
end

Spec::Rake::SpecTask.new(:rcov) do |spec|
  spec.libs << 'lib' << 'spec'
  spec.pattern = 'spec/**/*_spec.rb'
  spec.rcov = true
end

task :default => :spec

require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
  version = File.exist?('VERSION') ? File.read('VERSION') : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "anemone #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end

@@ -0,0 +1 @@
0.5.0

@@ -0,0 +1,5 @@
module Anemone
  class Error < ::StandardError
    attr_accessor :wrapped_exception
  end
end
@@ -0,0 +1,157 @@
require 'net/https'
require 'anemone/page'
require 'anemone/cookie_store'

module Anemone
  class HTTP
    # Maximum number of redirects to follow on each get_response
    REDIRECT_LIMIT = 5

    # CookieStore for this HTTP client
    attr_reader :cookie_store

    def initialize(opts = {})
      @connections = {}
      @opts = opts
      @cookie_store = CookieStore.new(@opts[:cookies])
    end

    #
    # Fetch a single Page from the response of an HTTP request to *url*.
    # Just gets the final destination page.
    #
    def fetch_page(url, referer = nil, depth = nil)
      fetch_pages(url, referer, depth).last
    end

    #
    # Create new Pages from the response of an HTTP request to *url*,
    # including redirects
    #
    def fetch_pages(url, referer = nil, depth = nil)
      begin
        url = URI(url) unless url.is_a?(URI)
        pages = []
        get(url, referer) do |response, code, location, redirect_to, response_time|
          pages << Page.new(location, :body => response.body.dup,
                                      :code => code,
                                      :headers => response.to_hash,
                                      :referer => referer,
                                      :depth => depth,
                                      :redirect_to => redirect_to,
                                      :response_time => response_time)
        end

        return pages
      rescue => e
        if verbose?
          puts e.inspect
          puts e.backtrace
        end
        return [Page.new(url, :error => e)]
      end
    end

    #
    # The maximum number of redirects to follow
    #
    def redirect_limit
      @opts[:redirect_limit] || REDIRECT_LIMIT
    end

    #
    # The user-agent string which will be sent with each request,
    # or nil if no such option is set
    #
    def user_agent
      @opts[:user_agent]
    end

    #
    # Does this HTTP client accept cookies from the server?
    #
    def accept_cookies?
      @opts[:accept_cookies]
    end

    private

    #
    # Retrieve HTTP responses for *url*, including redirects.
    # Yields the response object, response code, and URI location
    # for each response.
    #
    def get(url, referer = nil)
      limit = redirect_limit
      loc = url
      begin
        # if redirected to a relative url, merge it with the host of the original
        # request url
        loc = url.merge(loc) if loc.relative?

        response, response_time = get_response(loc, referer)
        code = Integer(response.code)
        redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']).normalize : nil
        yield response, code, loc, redirect_to, response_time
        limit -= 1
      end while (loc = redirect_to) && allowed?(redirect_to, url) && limit > 0
    end

    #
    # Get an HTTPResponse for *url*, sending the appropriate User-Agent string
    #
    def get_response(url, referer = nil)
      full_path = url.query.nil? ? url.path : "#{url.path}?#{url.query}"

      opts = {}
      opts['User-Agent'] = user_agent if user_agent
      opts['Referer'] = referer.to_s if referer
      opts['Cookie'] = @cookie_store.to_s unless @cookie_store.empty? || (!accept_cookies? && @opts[:cookies].nil?)

      retries = 0
      begin
        start = Time.now()
        response = connection(url).get(full_path, opts)
        finish = Time.now()
        response_time = ((finish - start) * 1000).round
        @cookie_store.merge!(response['Set-Cookie']) if accept_cookies?
        return response, response_time
      rescue EOFError
        refresh_connection(url)
        retries += 1
        retry unless retries > 3
      end
    end

    def connection(url)
      @connections[url.host] ||= {}

      if conn = @connections[url.host][url.port]
        return conn
      end

      refresh_connection url
    end

    def refresh_connection(url)
      http = Net::HTTP.new(url.host, url.port)
      if url.scheme == 'https'
        http.use_ssl = true
        http.verify_mode = OpenSSL::SSL::VERIFY_NONE
      end
      @connections[url.host][url.port] = http.start
    end

    def verbose?
      @opts[:verbose]
    end

    #
    # Allowed to connect to the requested url?
    #
    def allowed?(to_url, from_url)
      to_url.host.nil? || (to_url.host == from_url.host)
    end

  end
end
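The client above can also be exercised on its own, outside of a crawl. A small sketch (the URL and option values are illustrative):

  http = Anemone::HTTP.new(:user_agent => "Anemone/#{Anemone::VERSION}", :accept_cookies => true)
  page = http.fetch_page(URI('http://example.com/'))
  puts page.code            # final response code after redirects
  puts page.response_time   # milliseconds, as recorded by get_response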
@@ -0,0 +1,202 @@
require 'nokogiri'
require 'ostruct'
require 'webrick/cookie'

module Anemone
  class Page

    # The URL of the page
    attr_reader :url
    # The raw HTTP response body of the page
    attr_reader :body
    # Headers of the HTTP response
    attr_reader :headers
    # URL of the page this one redirected to, if any
    attr_reader :redirect_to
    # Exception object, if one was raised during HTTP#fetch_page
    attr_reader :error

    # OpenStruct for user-stored data
    attr_accessor :data
    # Integer response code of the page
    attr_accessor :code
    # Boolean indicating whether or not this page has been visited in PageStore#shortest_paths!
    attr_accessor :visited
    # Depth of this page from the root of the crawl. This is not necessarily the
    # shortest path; use PageStore#shortest_paths! to find that value.
    attr_accessor :depth
    # URL of the page that brought us to this page
    attr_accessor :referer
    # Response time of the request for this page in milliseconds
    attr_accessor :response_time

    #
    # Create a new page
    #
    def initialize(url, params = {})
      @url = url
      @data = OpenStruct.new

      @code = params[:code]
      @headers = params[:headers] || {}
      @headers['content-type'] ||= ['']
      @aliases = Array(params[:aka]).compact
      @referer = params[:referer]
      @depth = params[:depth] || 0
      @redirect_to = to_absolute(params[:redirect_to])
      @response_time = params[:response_time]
      @body = params[:body]
      @error = params[:error]

      @fetched = !params[:code].nil?
    end

    #
    # Array of distinct A tag HREFs from the page
    #
    def links
      return @links unless @links.nil?
      @links = []
      return @links if !doc

      doc.search("//a[@href]").each do |a|
        u = a['href']
        next if u.nil? or u.empty?
        abs = to_absolute(URI(u)) rescue next
        @links << abs if in_domain?(abs)
      end
      @links.uniq!
      @links
    end

    #
    # Nokogiri document for the HTML body
    #
    def doc
      return @doc if @doc
      @doc = Nokogiri::HTML(@body) if @body && html? rescue nil
    end

    #
    # Delete the Nokogiri document and response body to conserve memory
    #
    def discard_doc!
      links # force parsing of page links before we trash the document
      @doc = @body = nil
    end

    #
    # Was the page successfully fetched?
    # +true+ if the page was fetched with no error, +false+ otherwise.
    #
    def fetched?
      @fetched
    end

    #
    # Array of cookies received with this page as WEBrick::Cookie objects.
    #
    def cookies
      WEBrick::Cookie.parse_set_cookies(@headers['Set-Cookie']) rescue []
    end

    #
    # The content-type returned by the HTTP request for this page
    #
    def content_type
      headers['content-type'].first
    end

    #
    # Returns +true+ if the page is a HTML document, returns +false+
    # otherwise.
    #
    def html?
      !!(content_type =~ %r{^(text/html|application/xhtml+xml)\b})
    end

    #
    # Returns +true+ if the page is a HTTP redirect, returns +false+
    # otherwise.
    #
    def redirect?
      (300..307).include?(@code)
    end

    #
    # Returns +true+ if the page was not found (returned 404 code),
    # returns +false+ otherwise.
    #
    def not_found?
      404 == @code
    end

    #
    # Converts relative URL *link* into an absolute URL based on the
    # location of the page
    #
    def to_absolute(link)
      return nil if link.nil?

      # remove anchor
      link = URI.encode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,''))

      relative = URI(link)
      absolute = @url.merge(relative)

      absolute.path = '/' if absolute.path.empty?

      return absolute
    end

    #
    # Returns +true+ if *uri* is in the same domain as the page, returns
    # +false+ otherwise
    #
    def in_domain?(uri)
      uri.host == @url.host
    end

    def marshal_dump
      [@url, @headers, @data, @body, @links, @code, @visited, @depth, @referer, @redirect_to, @response_time, @fetched]
    end

    def marshal_load(ary)
      @url, @headers, @data, @body, @links, @code, @visited, @depth, @referer, @redirect_to, @response_time, @fetched = ary
    end

    def to_hash
      {'url' => @url.to_s,
       'headers' => Marshal.dump(@headers),
       'data' => Marshal.dump(@data),
       'body' => @body,
       'links' => links.map(&:to_s),
       'code' => @code,
       'visited' => @visited,
       'depth' => @depth,
       'referer' => @referer.to_s,
       'redirect_to' => @redirect_to.to_s,
       'response_time' => @response_time,
       'fetched' => @fetched}
    end

    def self.from_hash(hash)
      page = self.new(URI(hash['url']))
      {'@headers' => Marshal.load(hash['headers']),
       '@data' => Marshal.load(hash['data']),
       '@body' => hash['body'],
       '@links' => hash['links'].map { |link| URI(link) },
       '@code' => hash['code'].to_i,
       '@visited' => hash['visited'],
       '@depth' => hash['depth'].to_i,
       '@referer' => hash['referer'],
       '@redirect_to' => URI(hash['redirect_to']),
       '@response_time' => hash['response_time'].to_i,
       '@fetched' => hash['fetched']
      }.each do |var, value|
        page.instance_variable_set(var, value)
      end
      page
    end
  end
end

@@ -0,0 +1,160 @@
require 'forwardable'

module Anemone
  class PageStore
    extend Forwardable

    def_delegators :@storage, :keys, :values, :size, :each

    def initialize(storage = {})
      @storage = storage
    end

    # We typically index the hash with a URI,
    # but convert it to a String for easier retrieval
    def [](index)
      @storage[index.to_s]
    end

    def []=(index, other)
      @storage[index.to_s] = other
    end

    def delete(key)
      @storage.delete key.to_s
    end

    def has_key?(key)
      @storage.has_key? key.to_s
    end

    def each_value
      each { |key, value| yield value }
    end

    def values
      result = []
      each { |key, value| result << value }
      result
    end

    def touch_key(key)
      self[key] = Page.new(key)
    end

    def touch_keys(keys)
      @storage.merge! keys.inject({}) { |h, k| h[k.to_s] = Page.new(k); h }
    end

    # Does this PageStore contain the specified URL?
    # HTTP and HTTPS versions of a URL are considered to be the same page.
    def has_page?(url)
      schemes = %w(http https)
      if schemes.include? url.scheme
        u = url.dup
        return schemes.any? { |s| u.scheme = s; has_key?(u) }
      end

      has_key? url
    end

    #
    # Use a breadth-first search to calculate the single-source
    # shortest paths from *root* to all pages in the PageStore
    #
    def shortest_paths!(root)
      root = URI(root) if root.is_a?(String)
      raise "Root node not found" if !has_key?(root)

      q = Queue.new

      q.enq root
      root_page = self[root]
      root_page.depth = 0
      root_page.visited = true
      self[root] = root_page
      while !q.empty?
        page = self[q.deq]
        page.links.each do |u|
          begin
            link = self[u]
            next if link.nil? || !link.fetched? || link.visited

            q << u unless link.redirect?
            link.visited = true
            link.depth = page.depth + 1
            self[u] = link

            if link.redirect?
              u = link.redirect_to
              redo
            end
          end
        end
      end

      self
    end

    #
    # Removes all Pages from storage where redirect? is true
    #
    def uniq!
      each_value { |page| delete page.url if page.redirect? }
      self
    end

    #
    # If given a single URL (as a String or URI), returns an Array of Pages which link to that URL
    # If given an Array of URLs, returns a Hash (URI => [Page, Page...]) of Pages linking to those URLs
    #
    def pages_linking_to(urls)
      unless urls.is_a?(Array)
        urls = [urls]
        single = true
      end

      urls.map! do |url|
        unless url.is_a?(URI)
          URI(url) rescue nil
        else
          url
        end
      end
      urls.compact

      links = {}
      urls.each { |url| links[url] = [] }
      values.each do |page|
        urls.each { |url| links[url] << page if page.links.include?(url) }
      end

      if single and !links.empty?
        return links[urls.first]
      else
        return links
      end
    end

    #
    # If given a single URL (as a String or URI), returns an Array of URLs which link to that URL
    # If given an Array of URLs, returns a Hash (URI => [URI, URI...]) of URLs linking to those URLs
    #
    def urls_linking_to(urls)
      unless urls.is_a?(Array)
        urls = [urls] unless urls.is_a?(Array)
        single = true
      end

      links = pages_linking_to(urls)
      links.each { |url, pages| links[url] = pages.map{|p| p.url} }

      if single and !links.empty?
        return links[urls.first]
      else
        return links
      end
    end

  end
end
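The CLI scripts earlier in this import drive PageStore in roughly this way after a crawl; a sketch only, where `root` stands for the start URL of the crawl:

  anemone.after_crawl do |pages|
    pages.shortest_paths!(root).uniq!        # drop redirect aliases, compute BFS depths
    broken = pages.values.select { |page| page.not_found? }.map { |page| page.url.to_s }
    pages.urls_linking_to(broken).each do |url, from|
      puts "#{url} linked from #{from.join(', ')}"
    end
  end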
@@ -0,0 +1,34 @@
module Anemone
  module Storage

    def self.Hash(*args)
      hash = Hash.new(*args)
      # add close method for compatibility with Storage::Base
      class << hash; def close; end; end
      hash
    end

    def self.PStore(*args)
      require 'anemone/storage/pstore'
      self::PStore.new(*args)
    end

    def self.TokyoCabinet(file = 'anemone.tch')
      require 'anemone/storage/tokyo_cabinet'
      self::TokyoCabinet.new(file)
    end

    def self.MongoDB(mongo_db = nil, collection_name = 'pages')
      require 'anemone/storage/mongodb'
      mongo_db ||= Mongo::Connection.new.db('anemone')
      raise "First argument must be an instance of Mongo::DB" unless mongo_db.is_a?(Mongo::DB)
      self::MongoDB.new(mongo_db, collection_name)
    end

    def self.Redis(opts = {})
      require 'anemone/storage/redis'
      self::Redis.new(opts)
    end

  end
end
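These factory methods plug into Core's :storage option, which otherwise defaults to the in-memory Hash. A sketch, assuming a local MongoDB or Redis server is running (URL and Redis options are illustrative):

  Anemone.crawl('http://example.com/', :storage => Anemone::Storage.MongoDB) do |anemone|
    # pages are persisted in the 'anemone' database instead of memory
  end

  Anemone.crawl('http://example.com/', :storage => Anemone::Storage.Redis(:db => 1)) do |anemone|
    # same crawl, backed by Redis
  end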
@@ -0,0 +1,75 @@
require 'anemone/storage/exceptions'

module Anemone
  module Storage
    class Base

      def initialize(adapter)
        @adap = adapter

        # verify adapter conforms to this class's methods
        methods.each do |method|
          if !@adap.respond_to?(method.to_sym)
            raise "Storage adapter must support method #{method}"
          end
        end
      end

      def [](key)
        @adap[key]
      rescue
        puts key
        raise RetrievalError, $!
      end

      def []=(key, value)
        @adap[key] = value
      rescue
        raise InsertionError, $!
      end

      def delete(key)
        @adap.delete(key)
      rescue
        raise DeletionError, $!
      end

      def each
        @adap.each { |k, v| yield k, v }
      rescue
        raise GenericError, $!
      end

      def merge!(hash)
        @adap.merge!(hash)
      rescue
        raise GenericError, $!
      end

      def close
        @adap.close
      rescue
        raise CloseError, $!
      end

      def size
        @adap.size
      rescue
        raise GenericError, $!
      end

      def keys
        @adap.keys
      rescue
        raise GenericError, $!
      end

      def has_key?(key)
        @adap.has_key?(key)
      rescue
        raise GenericError, $!
      end

    end
  end
end

@@ -0,0 +1,15 @@
module Anemone
  module Storage

    class GenericError < Error; end;

    class ConnectionError < Error; end

    class RetrievalError < Error; end

    class InsertionError < Error; end

    class CloseError < Error; end

  end
end
@@ -0,0 +1,89 @@
begin
  require 'mongo'
rescue LoadError
  puts "You need the mongo gem to use Anemone::Storage::MongoDB"
  exit
end

module Anemone
  module Storage
    class MongoDB

      BINARY_FIELDS = %w(body headers data)

      def initialize(mongo_db, collection_name)
        @db = mongo_db
        @collection = @db[collection_name]
        @collection.remove
        @collection.create_index 'url'
      end

      def [](url)
        if value = @collection.find_one('url' => url.to_s)
          load_page(value)
        end
      end

      def []=(url, page)
        hash = page.to_hash
        BINARY_FIELDS.each do |field|
          hash[field] = BSON::Binary.new(hash[field]) unless hash[field].nil?
        end
        @collection.update(
          {'url' => page.url.to_s},
          hash,
          :upsert => true
        )
      end

      def delete(url)
        page = self[url]
        @collection.remove('url' => url.to_s)
        page
      end

      def each
        @collection.find do |cursor|
          cursor.each do |doc|
            page = load_page(doc)
            yield page.url.to_s, page
          end
        end
      end

      def merge!(hash)
        hash.each { |key, value| self[key] = value }
        self
      end

      def size
        @collection.count
      end

      def keys
        keys = []
        self.each { |k, v| keys << k.to_s }
        keys
      end

      def has_key?(url)
        !!@collection.find_one('url' => url.to_s)
      end

      def close
        @db.connection.close
      end

      private

      def load_page(hash)
        BINARY_FIELDS.each do |field|
          hash[field] = hash[field].to_s
        end
        Page.from_hash(hash)
      end

    end
  end
end

@@ -0,0 +1,50 @@
require 'pstore'
require 'forwardable'

module Anemone
  module Storage
    class PStore
      extend Forwardable

      def_delegators :@keys, :has_key?, :keys, :size

      def initialize(file)
        File.delete(file) if File.exists?(file)
        @store = ::PStore.new(file)
        @keys = {}
      end

      def [](key)
        @store.transaction { |s| s[key] }
      end

      def []=(key,value)
        @keys[key] = nil
        @store.transaction { |s| s[key] = value }
      end

      def delete(key)
        @keys.delete(key)
        @store.transaction { |s| s.delete key}
      end

      def each
        @keys.each_key do |key|
          value = nil
          @store.transaction { |s| value = s[key] }
          yield key, value
        end
      end

      def merge!(hash)
        @store.transaction do |s|
          hash.each { |key, value| s[key] = value; @keys[key] = nil }
        end
        self
      end

      def close; end

    end
  end
end

@@ -0,0 +1,90 @@
require 'redis'

module Anemone
  module Storage
    class Redis

      MARSHAL_FIELDS = %w(links visited fetched)

      def initialize(opts = {})
        @redis = ::Redis.new(opts)
        @key_prefix = opts[:key_prefix] || 'anemone'
        keys.each { |key| delete(key) }
      end

      def [](key)
        rkey = "#{@key_prefix}:pages:#{key.to_s}"
        rget(rkey)
      end

      def []=(key, value)
        rkey = "#{@key_prefix}:pages:#{key.to_s}"
        hash = value.to_hash
        MARSHAL_FIELDS.each do |field|
          hash[field] = Marshal.dump(hash[field])
        end
        hash.each do |field, value|
          @redis.hset(rkey, field, value)
        end
      end

      def delete(key)
        rkey = "#{@key_prefix}:pages:#{key.to_s}"
        page = self[key]
        @redis.del(rkey)
        page
      end

      def each
        rkeys = @redis.keys("#{@key_prefix}:pages:*")
        rkeys.each do |rkey|
          page = rget(rkey)
          yield page.url.to_s, page
        end
      end

      def merge!(hash)
        hash.each { |key, value| self[key] = value }
        self
      end

      def size
        @redis.keys("#{@key_prefix}:pages:*").size
      end

      def keys
        keys = []
        self.each { |k, v| keys << k.to_s }
        keys
      end

      def has_key?(key)
        rkey = "#{@key_prefix}:pages:#{key.to_s}"
        @redis.exists(rkey)
      end

      def close
        @redis.quit
      end

      private

      def load_value(hash)
        MARSHAL_FIELDS.each do |field|
          unless hash[field].nil? || hash[field] == ''
            hash[field] = Marshal.load(hash[field])
          end
        end
        Page.from_hash(hash)
      end

      def rget(rkey)
        hash = @redis.hgetall(rkey)
        if !!hash
          load_value(hash)
        end
      end

    end
  end
end

@@ -0,0 +1,57 @@
begin
  require 'tokyocabinet'
rescue LoadError
  puts "You need the tokyocabinet gem to use Anemone::Storage::TokyoCabinet"
  exit
end

require 'forwardable'

module Anemone
  module Storage
    class TokyoCabinet
      extend Forwardable

      def_delegators :@db, :close, :size, :keys, :has_key?

      def initialize(file)
        raise "TokyoCabinet filename must have .tch extension" if File.extname(file) != '.tch'
        @db = ::TokyoCabinet::HDB::new
        @db.open(file, ::TokyoCabinet::HDB::OWRITER | ::TokyoCabinet::HDB::OCREAT)
        @db.clear
      end

      def [](key)
        if value = @db[key]
          load_value(value)
        end
      end

      def []=(key, value)
        @db[key] = [Marshal.dump(value)].pack("m")
      end

      def delete(key)
        value = self[key]
        @db.delete(key)
        value
      end

      def each
        @db.each { |k, v| yield k, load_value(v) }
      end

      def merge!(hash)
        hash.each { |key, value| self[key] = value }
        self
      end

      private

      def load_value(value)
        Marshal.load(value.unpack("m")[0])
      end

    end
  end
end
@@ -0,0 +1,39 @@
require 'anemone/http'

module Anemone
  class Tentacle

    #
    # Create a new Tentacle
    #
    def initialize(link_queue, page_queue, opts = {})
      @link_queue = link_queue
      @page_queue = page_queue
      @http = Anemone::HTTP.new(opts)
      @opts = opts
    end

    #
    # Gets links from @link_queue, and returns the fetched
    # Page objects into @page_queue
    #
    def run
      loop do
        link, referer, depth = @link_queue.deq

        break if link == :END

        @http.fetch_pages(link, referer, depth).each { |page| @page_queue << page }

        delay
      end
    end

    private

    def delay
      sleep @opts[:delay] if @opts[:delay] > 0
    end

  end
end