add scraper selector

parent db03b59fb4
commit 467ab8a425

@@ -38,7 +38,7 @@ from . import cc
 from . import models
 from .parameters import WORK_IDENTIFIERS
 from .validation import identifier_cleaner
-from .loaders.scrape import BaseScraper, scrape_sitemap
+from .loaders.scrape import get_scraper, scrape_sitemap
 
 logger = logging.getLogger(__name__)
 request_log = logging.getLogger("requests")
@@ -1023,7 +1023,7 @@ def ebooks_in_github_release(repo_owner, repo_name, tag, token=None):
 
 def add_by_webpage(url, work=None, user=None):
     edition = None
-    scraper = BaseScraper(url)
+    scraper = get_scraper(url)
     loader = BasePandataLoader(url)
     pandata = Pandata()
     pandata.metadata = scraper.metadata
@@ -1035,7 +1035,6 @@
 
 def add_by_sitemap(url, maxnum=None):
     editions = []
-    scraper = BaseScraper(url)
     for bookdata in scrape_sitemap(url, maxnum=maxnum):
        edition = work = None
        loader = BasePandataLoader(bookdata.base)
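
The one-line deletion in the last hunk removes what appears to be dead code:
the scraper built from the sitemap URL itself was never read afterwards, since
scrape_sitemap() (second file, below) constructs one scraper per sitemap
entry. A minimal sketch of how the loop consumes it after this change,
assuming a hypothetical sitemap URL and that each scraper's metadata dict
carries a 'title' key, as the get_title() machinery below suggests:

    # one scraper per <loc> entry, chosen by get_scraper(); pages whose
    # scraped genre is not 'book' are filtered out inside scrape_sitemap()
    for scraper in scrape_sitemap('https://press.rebus.community/sitemap.xml', maxnum=5):
        print(scraper.metadata.get('title'))
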
@@ -14,7 +14,7 @@ logger = logging.getLogger(__name__)
 CONTAINS_COVER = re.compile('cover')
 CONTAINS_CC = re.compile('creativecommons.org')
 
-class BaseScraper(object):
+class BaseScraper(object):
     '''
     designed to make at least a decent guess for webpages that embed metadata
     '''
@@ -219,6 +219,11 @@ class BaseScraper(object):
         for link in links:
             self.set('rights_url', link['href'])
 
+    @classmethod
+    def can_scrape(cls, url):
+        ''' return True if the class can scrape the URL '''
+        return True
+
 class PressbooksScraper(BaseScraper):
     def get_downloads(self):
         for dl_type in ['epub', 'mobi', 'pdf']:
@@ -244,13 +249,23 @@ class PressbooksScraper(BaseScraper):
         else:
             super(PressbooksScraper, self).get_title()
 
+    @classmethod
+    def can_scrape(cls, url):
+        ''' return True if the class can scrape the URL '''
+        return url.find('press.rebus.community') > 0 or url.find('pressbooks.com') > 0
+
+def get_scraper(url):
+    scrapers = [PressbooksScraper, BaseScraper]
+    for scraper in scrapers:
+        if scraper.can_scrape(url):
+            return scraper(url)
+
 def scrape_sitemap(url, maxnum=None):
     try:
         response = requests.get(url, headers={"User-Agent": settings.USER_AGENT})
         doc = BeautifulSoup(response.content, 'lxml')
         for page in doc.find_all('loc')[0:maxnum]:
-            scraper = BaseScraper(page.text)
+            scraper = get_scraper(page.text)
             if scraper.metadata.get('genre', None) == 'book':
                 yield scraper
     except requests.exceptions.RequestException as e:
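
The new get_scraper() is a first-match selector: candidate classes are tried
in list order, and BaseScraper's always-true can_scrape() makes it the
catch-all, so every URL gets some scraper and the function never returns
None. A self-contained sketch of that dispatch (the stand-in classes mirror
only the can_scrape() logic above, with fetching and parsing omitted; the
example URLs are made up):

    class BaseScraper(object):
        def __init__(self, url):
            self.url = url

        @classmethod
        def can_scrape(cls, url):
            ''' return True if the class can scrape the URL '''
            return True

    class PressbooksScraper(BaseScraper):
        @classmethod
        def can_scrape(cls, url):
            # str.find() returns the match position or -1; the scheme prefix
            # means the host never sits at index 0, so the > 0 test is safe
            return url.find('press.rebus.community') > 0 or url.find('pressbooks.com') > 0

    def get_scraper(url):
        # first class whose can_scrape() accepts the URL wins
        scrapers = [PressbooksScraper, BaseScraper]
        for scraper in scrapers:
            if scraper.can_scrape(url):
                return scraper(url)

    print(type(get_scraper('https://press.rebus.community/a-book/')).__name__)
    # -> PressbooksScraper
    print(type(get_scraper('https://example.org/a-book/')).__name__)
    # -> BaseScraper

Supporting a new site then means subclassing BaseScraper, overriding
can_scrape(), and inserting the class ahead of BaseScraper in the scrapers
list.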