adapt to newer pressbooks version
parent
a355a873f7
commit
7efb048a5c
|
@ -1,18 +1,24 @@
|
|||
import re
|
||||
from regluit.core.validation import identifier_cleaner
|
||||
from . import BaseScraper
|
||||
|
||||
class PressbooksScraper(BaseScraper):
|
||||
can_scrape_hosts = ['bookkernel.com', 'milnepublishing.geneseo.edu',
|
||||
'press.rebus.community', 'pb.unizin.org']
|
||||
can_scrape_hosts = [
|
||||
'bookkernel.com', 'milnepublishing.geneseo.edu', 'press.rebus.community', 'pb.unizin.org',
|
||||
'opentext.wsu.edu', 'oer.missouriwestern.edu', 'eskript.ethz.ch', 'opentext.lib.vt.edu',]
|
||||
can_scrape_strings = ['pressbooks']
|
||||
|
||||
def get_downloads(self):
    """Locate ebook download links and record them as download_url_<format>.

    For each supported format, first try the Pressbooks convention of an
    element carrying the format name as a CSS class (the link is on its
    parent element); if that yields nothing, fall back to any anchor whose
    href ends with the format extension.

    Side effects: calls self.set('download_url_epub' / '_mobi' / '_pdf', url)
    for each format where a link is found.
    """
    for dl_type in ['epub', 'mobi', 'pdf']:
        value = None
        download_el = self.doc.select_one('.{}'.format(dl_type))
        if download_el:
            # Pressbooks puts the href on the parent of the class-bearing el.
            parent = download_el.find_parent()
            if parent:
                value = parent.get('href')
        if not value:
            # Fallback for themes without the format class: any <a> whose
            # href ends in the format name.
            a = self.doc.find('a', href=re.compile(r'{}$'.format(dl_type)))
            value = a.get('href') if a else None
        if value:
            self.set('download_url_{}'.format(dl_type), value)
|
||||
|
||||
def get_publisher(self):
|
||||
value = self.get_dt_dd('Publisher')
|
||||
|
@ -22,8 +28,10 @@ class PressbooksScraper(BaseScraper):
|
|||
if value:
|
||||
self.set('publisher', value)
|
||||
else:
|
||||
super(PressbooksScraper, self).get_publisher()
|
||||
|
||||
value = self.check_metas(['citation_publisher', 'publisher', r'DC\.Source'])
|
||||
if value:
|
||||
self.set('publisher', value)
|
||||
|
||||
def get_title(self):
|
||||
value = self.doc.select_one('.entry-title a[title]')
|
||||
value = value['title'] if value else None
|
||||
|
|
|
@ -8,7 +8,12 @@ from django.conf import settings
|
|||
from urlparse import urljoin
|
||||
|
||||
from regluit.core import models
|
||||
from regluit.core.validation import authlist_cleaner, identifier_cleaner, validate_date
|
||||
from regluit.core.validation import (
|
||||
authlist_cleaner,
|
||||
identifier_cleaner,
|
||||
valid_subject,
|
||||
validate_date,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
@ -66,6 +71,8 @@ class BaseScraper(object):
|
|||
#
|
||||
|
||||
def set(self, name, value):
    """Store a metadata value under *name*, trimming whitespace on strings."""
    # Strip surrounding whitespace from text values; leave other types as-is.
    # NOTE: this file targets Python 2, so both str and unicode are handled.
    cleaned = value.strip() if isinstance(value, (str, unicode)) else value
    self.metadata[name] = cleaned
|
||||
|
||||
def fetch_one_el_content(self, el_name):
|
||||
|
@ -110,7 +117,7 @@ class BaseScraper(object):
|
|||
''' get the content of <dd> after a <dt> containing name'''
|
||||
dt = self.doc.find('dt', string=re.compile(name))
|
||||
dd = dt.find_next_sibling('dd') if dt else None
|
||||
return dd.text if dd else None
|
||||
return dd.text.strip() if dd and dd.text else None
|
||||
|
||||
def get_itemprop(self, name, **attrs):
|
||||
value_list = []
|
||||
|
@ -244,7 +251,11 @@ class BaseScraper(object):
|
|||
def get_keywords(self):
    """Extract subject keywords from the page's 'keywords' meta tag.

    Splits the meta content on commas/semicolons (with optional surrounding
    spaces), keeps only entries accepted by valid_subject, and stores the
    result via self.set('subjects', ...).
    """
    # check_metas may return None when no 'keywords' meta is present;
    # guard with `or ''` so .strip doesn't raise AttributeError.
    value = (self.check_metas(['keywords']) or '').strip(',;')
    if value:
        subjects = [
            subject for subject in re.split(' *[;,] *', value)
            if valid_subject(subject)
        ]
        self.set('subjects', subjects)
|
||||
|
||||
def get_publisher(self):
|
||||
value = self.check_metas(['citation_publisher', r'DC\.Source'])
|
||||
|
@ -255,8 +266,8 @@ class BaseScraper(object):
|
|||
value = self.get_itemprop('datePublished', list_mode='one_item')
|
||||
if not value:
|
||||
value = self.check_metas([
|
||||
'citation_publication_date', r'DC\.Date\.issued', 'datePublished',
|
||||
'books:release_date', 'book:release_date'
|
||||
'citation_publication_date', 'copyrightYear', r'DC\.Date\.issued', 'datePublished',
|
||||
'books:release_date', 'book:release_date',
|
||||
])
|
||||
if value:
|
||||
value = validate_date(value)
|
||||
|
|
Loading…
Reference in New Issue