add scraper selector

commit 467ab8a425 (branch pull/43/head)
parent db03b59fb4
eric, 2017-09-27 19:20:14 -04:00

2 changed files with 19 additions and 5 deletions

@@ -38,7 +38,7 @@ from . import cc
 from . import models
 from .parameters import WORK_IDENTIFIERS
 from .validation import identifier_cleaner
-from .loaders.scrape import BaseScraper, scrape_sitemap
+from .loaders.scrape import get_scraper, scrape_sitemap
 
 logger = logging.getLogger(__name__)
 request_log = logging.getLogger("requests")
@@ -1023,7 +1023,7 @@ def ebooks_in_github_release(repo_owner, repo_name, tag, token=None):
 
 def add_by_webpage(url, work=None, user=None):
     edition = None
-    scraper = BaseScraper(url)
+    scraper = get_scraper(url)
     loader = BasePandataLoader(url)
     pandata = Pandata()
     pandata.metadata = scraper.metadata
@@ -1035,7 +1035,6 @@ def add_by_webpage(url, work=None, user=None):
 
 def add_by_sitemap(url, maxnum=None):
     editions = []
-    scraper = BaseScraper(url)
     for bookdata in scrape_sitemap(url, maxnum=maxnum):
         edition = work = None
         loader = BasePandataLoader(bookdata.base)
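
For orientation: `get_scraper` is the selector added in the second file of this commit. Below is a condensed, self-contained sketch of the pattern as this diff defines it; the `__init__` body is stubbed out here, since the real `BaseScraper` evidently fetches and parses the page on construction to populate `scraper.metadata`.

    class BaseScraper(object):
        def __init__(self, url):
            self.url = url  # stub: the real constructor scrapes the page

        @classmethod
        def can_scrape(cls, url):
            ''' return True if the class can scrape the URL '''
            return True  # catch-all: the base class accepts any URL

    class PressbooksScraper(BaseScraper):
        @classmethod
        def can_scrape(cls, url):
            return url.find('press.rebus.community') > 0 or url.find('pressbooks.com') > 0

    def get_scraper(url):
        # try the most specific scraper first; BaseScraper is the fallback,
        # so a scraper instance is always returned
        for scraper in [PressbooksScraper, BaseScraper]:
            if scraper.can_scrape(url):
                return scraper(url)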

@@ -14,7 +14,7 @@ logger = logging.getLogger(__name__)
 CONTAINS_COVER = re.compile('cover')
 CONTAINS_CC = re.compile('creativecommons.org')
 
 class BaseScraper(object):
     '''
     designed to make at least a decent guess for webpages that embed metadata
     '''
@@ -219,6 +219,11 @@ class BaseScraper(object):
         for link in links:
             self.set('rights_url', link['href'])
+
+    @classmethod
+    def can_scrape(cls, url):
+        ''' return True if the class can scrape the URL '''
+        return True
 
 class PressbooksScraper(BaseScraper):
     def get_downloads(self):
         for dl_type in ['epub', 'mobi', 'pdf']:
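
`BaseScraper.can_scrape` is the extension hook: a subclass claims a family of URLs by overriding it, as `PressbooksScraper` does in the next hunk. A hypothetical sketch of plugging another site-specific scraper into the same hook (`OtherSiteScraper` and its URL test are invented for illustration):

    class OtherSiteScraper(BaseScraper):
        @classmethod
        def can_scrape(cls, url):
            ''' return True if the class can scrape the URL '''
            return 'books.example.org' in url  # hypothetical host check

    # it would also need to be listed ahead of BaseScraper in get_scraper()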
@@ -244,13 +249,23 @@ class PressbooksScraper(BaseScraper):
         else:
             super(PressbooksScraper, self).get_title()
 
+    @classmethod
+    def can_scrape(cls, url):
+        ''' return True if the class can scrape the URL '''
+        return url.find('press.rebus.community') > 0 or url.find('pressbooks.com') > 0
+
+def get_scraper(url):
+    scrapers = [PressbooksScraper, BaseScraper]
+    for scraper in scrapers:
+        if scraper.can_scrape(url):
+            return scraper(url)
+
 def scrape_sitemap(url, maxnum=None):
     try:
         response = requests.get(url, headers={"User-Agent": settings.USER_AGENT})
         doc = BeautifulSoup(response.content, 'lxml')
         for page in doc.find_all('loc')[0:maxnum]:
-            scraper = BaseScraper(page.text)
+            scraper = get_scraper(page.text)
             if scraper.metadata.get('genre', None) == 'book':
                 yield scraper
     except requests.exceptions.RequestException as e:
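
A quick sketch of how the new dispatch behaves, with made-up URLs; only the classmethod checks are shown, since constructing a scraper evidently fetches the page:

    PressbooksScraper.can_scrape('https://press.rebus.community/somebook/')  # True
    PressbooksScraper.can_scrape('https://example.com/a-page/')              # False
    BaseScraper.can_scrape('https://example.com/a-page/')                    # True

    # so get_scraper() returns a PressbooksScraper for the first URL and
    # falls through to BaseScraper for the others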