scrape multiple books from one url

pull/95/head
eric 2018-07-09 15:46:36 -04:00
parent 1389f87616
commit 2f532b97f9
4 changed files with 123 additions and 13 deletions

View File

@ -124,6 +124,9 @@ def online_to_download(url):
booknum = FRONTIERSIN.search(url).group(1) booknum = FRONTIERSIN.search(url).group(1)
urls.append(u'https://www.frontiersin.org/GetFile.aspx?ebook={}&fileformat=EPUB'.format(booknum)) urls.append(u'https://www.frontiersin.org/GetFile.aspx?ebook={}&fileformat=EPUB'.format(booknum))
urls.append(u'https://www.frontiersin.org/GetFile.aspx?ebook={}&fileformat=PDF'.format(booknum)) urls.append(u'https://www.frontiersin.org/GetFile.aspx?ebook={}&fileformat=PDF'.format(booknum))
elif url.find(u'edp-open.org/books-in-') >= 0:
# pages needing multi-scrape
return urls
else: else:
urls.append(url) urls.append(url)
return urls return urls

View File

@ -0,0 +1,94 @@
import logging
import re
from urlparse import urljoin
from bs4 import BeautifulSoup
import requests
from django.conf import settings
from regluit.core.bookloader import add_from_bookdatas
from regluit.core.loaders.scrape import BaseScraper
from regluit.core.validation import identifier_cleaner
logger = logging.getLogger(__name__)
'''
Use for web pages that list multiple books.
multiscrape() returns an iterator of scrapers, one per book section of the page.
'''
class BaseMultiScraper(BaseScraper):
    """Scraper built from already-parsed markup for one book on a shared page.

    BaseScraper.__init__ is not invoked; the markup handed to the constructor
    is used directly as ``self.doc``.
    """

    def __init__(self, url, doc):
        """Collect metadata for the book described by ``doc``.

        url -- address of the page the markup came from; kept as ``self.base``
               and used as the initial 'http' identifier.
        doc -- parsed markup for a single book.
        """
        self.base = url
        self.doc = doc
        self.identifiers = {'http': url}
        self.metadata = {}
        self.get_all()
        # fill in placeholders for anything the getters left empty
        fallbacks = (
            ('title', '!!! missing title !!!'),
            ('language', 'en'),
        )
        for field, fallback in fallbacks:
            if not self.metadata.get(field):
                self.set(field, fallback)
        self.metadata['identifiers'] = self.identifiers
def multiscrape(url, divider, scraper_class=BaseMultiScraper):
    """Fetch ``url`` and yield one scraper per book section found on it.

    url           -- address of the page listing several books.
    divider       -- callable taking the parsed page and returning an
                     iterable of per-book sections.
    scraper_class -- class instantiated as ``scraper_class(url, section)``
                     for each section.

    Nothing is yielded when the request fails or the response status is not
    200; both cases are logged.
    """
    try:
        response = requests.get(url, headers={"User-Agent": settings.USER_AGENT})
    except requests.exceptions.RequestException as e:
        # This is a plain function, so there is no ``self`` to reset here;
        # log the failure and end the generator without yielding.
        logger.error(e)
        return
    if response.status_code != 200:
        logger.error(u'multiscrape got status %s for %s', response.status_code, url)
        return
    doc = BeautifulSoup(response.content, 'lxml')
    for section in divider(doc):
        yield scraper_class(url, section)
# following is code specific to edp-open.org; refactor when we add another
def divider(doc):
    """Return the elements of ``doc`` matching the selector 'article.Bk'."""
    book_sections = doc.select('article.Bk')
    return book_sections
# Matches a run of digits and hyphens; searched against the text that
# follows each 'ISBN' marker to pull out the number.
ISBNMATCH = re.compile(r'([\d\-]+)')
class EDPMultiScraper(BaseMultiScraper):
    """Scraper for a single book entry on an edp-open.org listing page.

    Every getter tolerates missing markup: an exception raised while one
    entry is being scraped would otherwise stop the whole page's generator
    and lose the remaining books.
    """

    def get_isbns(self):
        '''return a dict of edition keys and ISBNs'''
        isbns = {}
        info_el = self.doc.select_one('p.nfo')
        if info_el is None:
            # no info paragraph for this entry, so no ISBNs to report
            return isbns
        isbn_cleaner = identifier_cleaner('isbn', quiet=True)
        # labels are consumed from the end: the first ISBN found is keyed
        # 'paper', the second 'pdf', the third 'epub'
        labels = ['epub', 'pdf', 'paper']
        # the text before the first 'ISBN' marker carries no number
        for isbntext in re.split('ISBN', info_el.text)[1:]:
            if not labels:
                # more ISBNs than known edition labels; ignore the extras
                break
            isbnmatch = ISBNMATCH.search(isbntext)
            if isbnmatch:
                isbns[labels.pop()] = isbn_cleaner(isbnmatch.group(0))
        return isbns

    def get_downloads(self):
        """Record the pdf and epub download links found in 'nav.dl'."""
        dl = self.doc.select_one('nav.dl')
        if dl is None:
            return
        for link in dl.select('a.fulldl'):
            target = link.get('href')
            if not target:
                continue
            href = urljoin(self.base, target)
            if href.endswith('.pdf'):
                self.set('download_url_pdf', href)
            elif href.endswith('.epub'):
                self.set('download_url_epub', href)

    def get_language(self):
        """Set the language from the listing page the entry came from.

        Entries from the 'books-in-english' page are marked 'en'; anything
        else keeps the previous behaviour of 'fr'.
        """
        if 'books-in-english' in self.base:
            self.set('language', 'en')
        else:
            self.set('language', 'fr')

    def get_title(self):
        """Set the title from the entry's h2 and point the 'http' identifier
        at the heading's anchor when it has an id."""
        heading = self.doc.select_one('h2')
        if heading is None:
            # leave the title unset so the constructor's placeholder applies
            return
        book_id = heading.get('id')
        if book_id:
            self.identifiers['http'] = u'{}#{}'.format(self.base, book_id)
        self.set('title', heading.text)
def edp_scrape():
    """Scrape both edp-open.org listing pages and pass the resulting
    scrapers to add_from_bookdatas, one page at a time."""
    listing_pages = (
        'https://www.edp-open.org/books-in-french',
        'https://www.edp-open.org/books-in-english',
    )
    for listing_page in listing_pages:
        add_from_bookdatas(
            multiscrape(listing_page, divider, scraper_class=EDPMultiScraper)
        )

View File

@ -51,19 +51,7 @@ class BaseScraper(object):
self.doc = BeautifulSoup(response.content, 'lxml') self.doc = BeautifulSoup(response.content, 'lxml')
for review in self.doc.find_all(itemtype="http://schema.org/Review"): for review in self.doc.find_all(itemtype="http://schema.org/Review"):
review.clear() review.clear()
self.setup() self.get_all()
self.get_genre()
self.get_title()
self.get_language()
self.get_description()
self.get_identifiers()
self.get_keywords()
self.get_publisher()
self.get_pubdate()
self.get_authors()
self.get_cover()
self.get_downloads()
self.get_license()
if not self.metadata.get('title', None): if not self.metadata.get('title', None):
self.set('title', '!!! missing title !!!') self.set('title', '!!! missing title !!!')
if not self.metadata.get('language', None): if not self.metadata.get('language', None):
@ -141,6 +129,21 @@ class BaseScraper(object):
value_list.append(el['content']) value_list.append(el['content'])
return value_list return value_list
def get_all(self):
    """Run setup() and then each metadata getter, in this fixed order."""
    # setup() comes first; its comment says it is for fetching auxiliary
    # resources based on doc
    self.setup()
    self.get_genre()
    self.get_title()
    self.get_language()
    self.get_description()
    self.get_identifiers()
    self.get_keywords()
    self.get_publisher()
    self.get_pubdate()
    self.get_authors()
    self.get_cover()
    self.get_downloads()
    self.get_license()
def setup(self): def setup(self):
# use this method to get auxiliary resources based on doc # use this method to get auxiliary resources based on doc
pass pass

View File

@ -0,0 +1,10 @@
from django.core.management.base import BaseCommand
from regluit.core.loaders.multiscrape import edp_scrape
class Command(BaseCommand):
    """Management command that runs the edp-open multi-book scrape."""
    help = "load books from edp-open"

    def handle(self, **options):
        """Call edp_scrape(); ``options`` is accepted but not used."""
        edp_scrape()