Adapt to newer Pressbooks version

pull/95/head
eric 2018-08-02 17:27:04 -04:00
parent a355a873f7
commit 7efb048a5c
2 changed files with 30 additions and 11 deletions

View File

@ -1,16 +1,22 @@
import re
from regluit.core.validation import identifier_cleaner
from . import BaseScraper
class PressbooksScraper(BaseScraper):
can_scrape_hosts = ['bookkernel.com', 'milnepublishing.geneseo.edu',
'press.rebus.community', 'pb.unizin.org']
can_scrape_hosts = [
'bookkernel.com', 'milnepublishing.geneseo.edu', 'press.rebus.community', 'pb.unizin.org',
'opentext.wsu.edu', 'oer.missouriwestern.edu', 'eskript.ethz.ch', 'opentext.lib.vt.edu',]
can_scrape_strings = ['pressbooks']
def get_downloads(self):
    """Record a download URL for each ebook format found in the page.

    For each of 'epub', 'mobi' and 'pdf', the href is read from the parent
    of the element selected by that CSS class.  When that produces no
    href -- no such element, no parent, or a parent without an href -- the
    <a> element whose href ends with the format name is used instead.
    Every non-empty result is stored under 'download_url_<format>' via
    self.set().

    NOTE(review): self.doc is presumably a BeautifulSoup document (it is
    used through select_one/find/find_parent) -- confirm against the base
    class.
    """
    for dl_type in ['epub', 'mobi', 'pdf']:
        download_el = self.doc.select_one('.{}'.format(dl_type))
        # Look the parent up once rather than once to test and once to read.
        parent = download_el.find_parent() if download_el else None
        value = parent.get('href') if parent else None
        if not value:
            # Fall back to a link whose target ends in the format name.
            # This also covers a class-marked element whose parent carries
            # no href, which previously left the format unrecorded.
            link = self.doc.find('a', href=re.compile(r'{}$'.format(dl_type)))
            value = link.get('href') if link else None
        if value:
            self.set('download_url_{}'.format(dl_type), value)
@ -22,7 +28,9 @@ class PressbooksScraper(BaseScraper):
if value:
self.set('publisher', value)
else:
super(PressbooksScraper, self).get_publisher()
value = self.check_metas(['citation_publisher', 'publisher', r'DC\.Source'])
if value:
self.set('publisher', value)
def get_title(self):
value = self.doc.select_one('.entry-title a[title]')

View File

@ -8,7 +8,12 @@ from django.conf import settings
from urlparse import urljoin
from regluit.core import models
from regluit.core.validation import authlist_cleaner, identifier_cleaner, validate_date
from regluit.core.validation import (
authlist_cleaner,
identifier_cleaner,
valid_subject,
validate_date,
)
logger = logging.getLogger(__name__)
@ -66,6 +71,8 @@ class BaseScraper(object):
#
def set(self, name, value):
    """Store value in self.metadata under the key name.

    str and unicode values have leading and trailing whitespace stripped
    before being stored; any other value is stored unchanged.
    """
    is_text = isinstance(value, (str, unicode))
    self.metadata[name] = value.strip() if is_text else value
def fetch_one_el_content(self, el_name):
@ -110,7 +117,7 @@ class BaseScraper(object):
''' get the content of <dd> after a <dt> containing name'''
dt = self.doc.find('dt', string=re.compile(name))
dd = dt.find_next_sibling('dd') if dt else None
return dd.text if dd else None
return dd.text.strip() if dd and dd.text else None
def get_itemprop(self, name, **attrs):
value_list = []
@ -244,7 +251,11 @@ class BaseScraper(object):
def get_keywords(self):
    """Store the page's keywords as the 'subjects' metadata list.

    The value returned by self.check_metas(['keywords']) has leading and
    trailing ',' and ';' removed and is then split on commas/semicolons
    (with optional surrounding spaces).  Only the pieces accepted by
    valid_subject() are kept.  Nothing is stored when no keywords value
    is present.
    """
    # check_metas is defined outside this view; guard against a None
    # result so a page without keywords cannot raise AttributeError.
    value = (self.check_metas(['keywords']) or '').strip(',;')
    if value:
        # Store the filtered list once; the unfiltered split is never stored.
        subjects = [
            subject
            for subject in re.split(' *[;,] *', value)
            if valid_subject(subject)
        ]
        self.set('subjects', subjects)
def get_publisher(self):
value = self.check_metas(['citation_publisher', r'DC\.Source'])
@ -255,8 +266,8 @@ class BaseScraper(object):
value = self.get_itemprop('datePublished', list_mode='one_item')
if not value:
value = self.check_metas([
'citation_publication_date', r'DC\.Date\.issued', 'datePublished',
'books:release_date', 'book:release_date'
'citation_publication_date', 'copyrightYear', r'DC\.Date\.issued', 'datePublished',
'books:release_date', 'book:release_date',
])
if value:
value = validate_date(value)