2010-11-06 04:34:43 +00:00
|
|
|
require 'net/https'
|
|
|
|
require 'anemone/page'
|
|
|
|
require 'anemone/cookie_store'
|
|
|
|
|
|
|
|
module Anemone
  #
  # HTTP client used to fetch pages. It keeps one open connection per
  # host/port pair, follows same-host redirects up to a configurable limit,
  # and wraps every response (or failure) in a Page.
  #
  # Options read from the +opts+ hash given to #initialize:
  #   :cookies         - initial cookies handed to the CookieStore
  #   :accept_cookies  - merge Set-Cookie response headers into the store
  #   :redirect_limit  - maximum responses fetched per request chain
  #   :retry_limit     - retries after an EOFError (default 3)
  #   :user_agent      - User-Agent header value
  #   :http_basic_auth - value appended to "Basic " in the Authorization header
  #   :inject_headers  - array of raw "Name: value" header strings
  #   :request_factory - callable(connection, path, headers) returning the response
  #   :http_factory    - callable(url) returning an unstarted HTTP object
  #   :verbose         - print exceptions raised while fetching
  #
  class HTTP
    # Maximum number of redirects to follow on each get_response
    REDIRECT_LIMIT = 5

    # CookieStore for this HTTP client
    attr_reader :cookie_store

    #
    # Create a client configured by the *opts* hash (see the class comment
    # for the keys that are read).
    #
    def initialize(opts = {})
      @connections = {}
      @opts = opts
      @cookie_store = CookieStore.new(@opts[:cookies])
    end

    #
    # Fetch a single Page from the response of an HTTP request to *url*.
    # Just gets the final destination page.
    #
    def fetch_page(url, referer = nil, depth = nil)
      fetch_pages(url, referer, depth).last
    end

    #
    # Create new Pages from the response of an HTTP request to *url*,
    # including redirects.
    #
    # Returns an Array with one Page per response received. If anything
    # raises while fetching, the pages gathered so far are discarded and a
    # single Page carrying the exception under :error is returned instead.
    #
    def fetch_pages(url, referer = nil, depth = nil)
      url = URI(url) unless url.is_a?(URI)
      pages = []
      get(url, referer) do |response, code, location, redirect_to, response_time|
        pages << Page.new(location, :body => response.body.dup,
                                    :code => code,
                                    :headers => response.to_hash,
                                    :referer => referer,
                                    :depth => depth,
                                    :redirect_to => redirect_to,
                                    :response_time => response_time)
      end
      pages
    rescue => e
      if verbose?
        puts e.inspect
        puts e.backtrace
      end
      [Page.new(url, :error => e)]
    end

    #
    # The maximum number of redirects to follow
    #
    def redirect_limit
      @opts[:redirect_limit] || REDIRECT_LIMIT
    end

    #
    # The user-agent string which will be sent with each request,
    # or nil if no such option is set
    #
    def user_agent
      @opts[:user_agent]
    end

    #
    # Does this HTTP client accept cookies from the server?
    #
    def accept_cookies?
      @opts[:accept_cookies]
    end

    private

    #
    # Retrieve HTTP responses for *url*, including redirects.
    # Yields the response object, response code, URI location, redirect
    # target (or nil) and response time in milliseconds for each response.
    #
    def get(url, referer = nil)
      limit = redirect_limit
      base = url
      loc = url
      loop do
        # A relative redirect target is resolved against the URL that issued
        # the redirect, not against the very first request URL.
        loc = base.merge(loc) if loc.relative?

        response, response_time = get_response(loc, referer)
        code = Integer(response.code)
        redirect_to = redirect_target(response)
        yield response, code, loc, redirect_to, response_time
        limit -= 1

        break unless redirect_to && allowed?(redirect_to, url) && limit > 0
        base = loc
        loc = redirect_to
      end
    end

    #
    # The normalized redirect target of *response*, or nil when the response
    # is not a redirection or carries no Location header (304 Not Modified
    # is a Net::HTTPRedirection but normally has no Location).
    #
    def redirect_target(response)
      return nil unless response.is_a?(Net::HTTPRedirection)
      location = response['location']
      return nil if location.nil? || location.strip.empty?
      URI(location).normalize
    end

    #
    # Get an HTTPResponse for *url*, sending the appropriate User-Agent string
    #
    # MODIFIED: Change get_response to allow fine tuning of the HTTP request
    # before it is sent to the remote system (via opts[:request_factory]).
    #
    # Returns [response, response_time_in_ms]. An EOFError is retried up to
    # retry_limit times on a fresh connection and re-raised after that.
    #
    def get_response(url, referer = nil)
      full_path = request_path(url)
      headers = request_headers(referer)

      retries = 0
      begin
        start = Time.now
        response = if @opts[:request_factory]
                     @opts[:request_factory].call(connection(url), full_path, headers)
                   else
                     connection(url).get(full_path, headers)
                   end
        finish = Time.now
        response_time = ((finish - start) * 1000).round
        @cookie_store.merge!(response['Set-Cookie']) if accept_cookies?
        return response, response_time
      rescue EOFError
        refresh_connection(url)
        retries += 1
        retry unless retries > retry_limit
        # Out of retries: surface the real failure instead of returning nil.
        raise
      end
    end

    #
    # Number of times a request is retried after an EOFError
    #
    def retry_limit
      @opts[:retry_limit] || 3
    end

    #
    # Path plus query string to request for *url*. An empty path is sent
    # as "/" because an empty request path is not a valid HTTP request.
    #
    def request_path(url)
      path = url.path.to_s.empty? ? '/' : url.path
      url.query.nil? ? path : "#{path}?#{url.query}"
    end

    #
    # Build the request header hash from the client options and *referer*.
    #
    def request_headers(referer)
      headers = {}
      headers['User-Agent'] = user_agent if user_agent
      headers['Referer'] = referer.to_s if referer

      # Cookies are sent when the store has any and either cookies are being
      # accepted or an initial set was supplied through opts[:cookies].
      send_cookies = !@cookie_store.empty? && (accept_cookies? || !@opts[:cookies].nil?)
      headers['Cookie'] = @cookie_store.to_s if send_cookies

      if @opts[:http_basic_auth]
        headers['Authorization'] = "Basic " + @opts[:http_basic_auth]
      end

      (@opts[:inject_headers] || []).each do |hdr|
        name, value = hdr.split(':', 2)
        next if value.nil? # no colon: not a usable "Name: value" header
        headers[name.strip] = value.strip
      end

      headers
    end

    #
    # The cached connection for the host and port of *url*, opening one
    # if none exists yet.
    #
    def connection(url)
      @connections[url.host] ||= {}
      @connections[url.host][url.port] || refresh_connection(url)
    end

    #
    # MODIFIED: Change refresh_connection to allow a HTTP factory to be used to
    # create the Net::HTTP object. This allows much more granular
    # control over the requests.
    #
    # Replaces the cached connection for *url* with a newly started one and
    # returns it. The connection being replaced is closed first so its
    # socket is not leaked.
    #
    def refresh_connection(url)
      @connections[url.host] ||= {}
      close_connection(@connections[url.host][url.port])

      http = if @opts[:http_factory]
               @opts[:http_factory].call(url)
             else
               default_connection(url)
             end

      @connections[url.host][url.port] = http.start
    end

    #
    # Build an unstarted Net::HTTP object for *url*.
    #
    def default_connection(url)
      http = Net::HTTP.new(url.host, url.port)
      if url.scheme == 'https'
        http.use_ssl = true
        # NOTE(review): certificate verification is disabled, so HTTPS peers
        # are not authenticated. Supply opts[:http_factory] to build a
        # verifying connection if that matters for the crawl.
        http.verify_mode = OpenSSL::SSL::VERIFY_NONE
      end
      http
    end

    #
    # Best-effort close of a connection that is about to be replaced.
    # Objects that do not expose started? (e.g. custom factory products)
    # are left alone.
    #
    def close_connection(conn)
      return unless conn.respond_to?(:started?) && conn.started?
      conn.finish
    rescue StandardError
      # The connection is usually already broken at this point; failing to
      # close it must not mask the error that triggered the refresh.
      nil
    end

    def verbose?
      @opts[:verbose]
    end

    #
    # Allowed to connect to the requested url? Only relative targets and
    # targets on the same host as *from_url* are allowed.
    #
    def allowed?(to_url, from_url)
      to_url.host.nil? || (to_url.host == from_url.host)
    end
  end
end
|