161 lines
3.6 KiB
Ruby
161 lines
3.6 KiB
Ruby
|
require 'forwardable'
|
||
|
|
||
|
module Anemone
|
||
|
class PageStore
|
||
|
extend Forwardable
|
||
|
|
||
|
def_delegators :@storage, :keys, :values, :size, :each
|
||
|
|
||
|
def initialize(storage = {})
|
||
|
@storage = storage
|
||
|
end
|
||
|
|
||
|
# We typically index the hash with a URI,
|
||
|
# but convert it to a String for easier retrieval
|
||
|
def [](index)
|
||
|
@storage[index.to_s]
|
||
|
end
|
||
|
|
||
|
def []=(index, other)
|
||
|
@storage[index.to_s] = other
|
||
|
end
|
||
|
|
||
|
def delete(key)
|
||
|
@storage.delete key.to_s
|
||
|
end
|
||
|
|
||
|
def has_key?(key)
|
||
|
@storage.has_key? key.to_s
|
||
|
end
|
||
|
|
||
|
def each_value
|
||
|
each { |key, value| yield value }
|
||
|
end
|
||
|
|
||
|
def values
|
||
|
result = []
|
||
|
each { |key, value| result << value }
|
||
|
result
|
||
|
end
|
||
|
|
||
|
def touch_key(key)
|
||
|
self[key] = Page.new(key)
|
||
|
end
|
||
|
|
||
|
def touch_keys(keys)
|
||
|
@storage.merge! keys.inject({}) { |h, k| h[k.to_s] = Page.new(k); h }
|
||
|
end
|
||
|
|
||
|
# Does this PageStore contain the specified URL?
|
||
|
# HTTP and HTTPS versions of a URL are considered to be the same page.
|
||
|
def has_page?(url)
|
||
|
schemes = %w(http https)
|
||
|
if schemes.include? url.scheme
|
||
|
u = url.dup
|
||
|
return schemes.any? { |s| u.scheme = s; has_key?(u) }
|
||
|
end
|
||
|
|
||
|
has_key? url
|
||
|
end
|
||
|
|
||
|
#
|
||
|
# Use a breadth-first search to calculate the single-source
|
||
|
# shortest paths from *root* to all pages in the PageStore
|
||
|
#
|
||
|
def shortest_paths!(root)
|
||
|
root = URI(root) if root.is_a?(String)
|
||
|
raise "Root node not found" if !has_key?(root)
|
||
|
|
||
|
q = Queue.new
|
||
|
|
||
|
q.enq root
|
||
|
root_page = self[root]
|
||
|
root_page.depth = 0
|
||
|
root_page.visited = true
|
||
|
self[root] = root_page
|
||
|
while !q.empty?
|
||
|
page = self[q.deq]
|
||
|
page.links.each do |u|
|
||
|
begin
|
||
|
link = self[u]
|
||
|
next if link.nil? || !link.fetched? || link.visited
|
||
|
|
||
|
q << u unless link.redirect?
|
||
|
link.visited = true
|
||
|
link.depth = page.depth + 1
|
||
|
self[u] = link
|
||
|
|
||
|
if link.redirect?
|
||
|
u = link.redirect_to
|
||
|
redo
|
||
|
end
|
||
|
end
|
||
|
end
|
||
|
end
|
||
|
|
||
|
self
|
||
|
end
|
||
|
|
||
|
#
|
||
|
# Removes all Pages from storage where redirect? is true
|
||
|
#
|
||
|
def uniq!
|
||
|
each_value { |page| delete page.url if page.redirect? }
|
||
|
self
|
||
|
end
|
||
|
|
||
|
#
|
||
|
# If given a single URL (as a String or URI), returns an Array of Pages which link to that URL
|
||
|
# If given an Array of URLs, returns a Hash (URI => [Page, Page...]) of Pages linking to those URLs
|
||
|
#
|
||
|
def pages_linking_to(urls)
|
||
|
unless urls.is_a?(Array)
|
||
|
urls = [urls]
|
||
|
single = true
|
||
|
end
|
||
|
|
||
|
urls.map! do |url|
|
||
|
unless url.is_a?(URI)
|
||
|
URI(url) rescue nil
|
||
|
else
|
||
|
url
|
||
|
end
|
||
|
end
|
||
|
urls.compact
|
||
|
|
||
|
links = {}
|
||
|
urls.each { |url| links[url] = [] }
|
||
|
values.each do |page|
|
||
|
urls.each { |url| links[url] << page if page.links.include?(url) }
|
||
|
end
|
||
|
|
||
|
if single and !links.empty?
|
||
|
return links[urls.first]
|
||
|
else
|
||
|
return links
|
||
|
end
|
||
|
end
|
||
|
|
||
|
#
|
||
|
# If given a single URL (as a String or URI), returns an Array of URLs which link to that URL
|
||
|
# If given an Array of URLs, returns a Hash (URI => [URI, URI...]) of URLs linking to those URLs
|
||
|
#
|
||
|
def urls_linking_to(urls)
|
||
|
unless urls.is_a?(Array)
|
||
|
urls = [urls] unless urls.is_a?(Array)
|
||
|
single = true
|
||
|
end
|
||
|
|
||
|
links = pages_linking_to(urls)
|
||
|
links.each { |url, pages| links[url] = pages.map{|p| p.url} }
|
||
|
|
||
|
if single and !links.empty?
|
||
|
return links[urls.first]
|
||
|
else
|
||
|
return links
|
||
|
end
|
||
|
end
|
||
|
|
||
|
end
|
||
|
end
|