From 467ab8a4257bc0fbb820a77757ae3fbe46af760f Mon Sep 17 00:00:00 2001 From: eric Date: Wed, 27 Sep 2017 19:20:14 -0400 Subject: [PATCH] add scraper selector --- core/bookloader.py | 5 ++--- core/loaders/scrape.py | 19 +++++++++++++++++-- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/core/bookloader.py b/core/bookloader.py index 68a80ed0..0326a8e3 100755 --- a/core/bookloader.py +++ b/core/bookloader.py @@ -38,7 +38,7 @@ from . import cc from . import models from .parameters import WORK_IDENTIFIERS from .validation import identifier_cleaner -from .loaders.scrape import BaseScraper, scrape_sitemap +from .loaders.scrape import get_scraper, scrape_sitemap logger = logging.getLogger(__name__) request_log = logging.getLogger("requests") @@ -1023,7 +1023,7 @@ def ebooks_in_github_release(repo_owner, repo_name, tag, token=None): def add_by_webpage(url, work=None, user=None): edition = None - scraper = BaseScraper(url) + scraper = get_scraper(url) loader = BasePandataLoader(url) pandata = Pandata() pandata.metadata = scraper.metadata @@ -1035,7 +1035,6 @@ def add_by_webpage(url, work=None, user=None): def add_by_sitemap(url, maxnum=None): editions = [] - scraper = BaseScraper(url) for bookdata in scrape_sitemap(url, maxnum=maxnum): edition = work = None loader = BasePandataLoader(bookdata.base) diff --git a/core/loaders/scrape.py b/core/loaders/scrape.py index 1978a45c..9656f7ea 100644 --- a/core/loaders/scrape.py +++ b/core/loaders/scrape.py @@ -14,7 +14,7 @@ logger = logging.getLogger(__name__) CONTAINS_COVER = re.compile('cover') CONTAINS_CC = re.compile('creativecommons.org') -class BaseScraper(object): +class BaseScraper(object): ''' designed to make at least a decent gues for webpages that embed metadata ''' @@ -219,6 +219,11 @@ class BaseScraper(object): for link in links: self.set('rights_url', link['href']) + @classmethod + def can_scrape(cls, url): + ''' return True if the class can scrape the URL ''' + return True + class PressbooksScraper(BaseScraper): def get_downloads(self): for dl_type in ['epub', 'mobi', 'pdf']: @@ -244,13 +249,23 @@ class PressbooksScraper(BaseScraper): else: super(PressbooksScraper, self).get_title() + @classmethod + def can_scrape(cls, url): + ''' return True if the class can scrape the URL ''' + return url.find('press.rebus.community') > 0 or url.find('pressbooks.com') > 0 +def get_scraper(url): + scrapers = [PressbooksScraper, BaseScraper] + for scraper in scrapers: + if scraper.can_scrape(url): + return scraper(url) + def scrape_sitemap(url, maxnum=None): try: response = requests.get(url, headers={"User-Agent": settings.USER_AGENT}) doc = BeautifulSoup(response.content, 'lxml') for page in doc.find_all('loc')[0:maxnum]: - scraper = BaseScraper(page.text) + scraper = get_scraper(page.text) if scraper.metadata.get('genre', None) == 'book': yield scraper except requests.exceptions.RequestException as e: