adapt to newer pressbooks version

pull/95/head
eric 2018-08-02 17:27:04 -04:00
parent a355a873f7
commit 7efb048a5c
2 changed files with 30 additions and 11 deletions

View File

@ -1,18 +1,24 @@
import re
from regluit.core.validation import identifier_cleaner from regluit.core.validation import identifier_cleaner
from . import BaseScraper from . import BaseScraper
class PressbooksScraper(BaseScraper): class PressbooksScraper(BaseScraper):
can_scrape_hosts = ['bookkernel.com', 'milnepublishing.geneseo.edu', can_scrape_hosts = [
'press.rebus.community', 'pb.unizin.org'] 'bookkernel.com', 'milnepublishing.geneseo.edu', 'press.rebus.community', 'pb.unizin.org',
'opentext.wsu.edu', 'oer.missouriwestern.edu', 'eskript.ethz.ch', 'opentext.lib.vt.edu',]
can_scrape_strings = ['pressbooks'] can_scrape_strings = ['pressbooks']
def get_downloads(self): def get_downloads(self):
for dl_type in ['epub', 'mobi', 'pdf']: for dl_type in ['epub', 'mobi', 'pdf']:
download_el = self.doc.select_one('.{}'.format(dl_type)) download_el = self.doc.select_one('.{}'.format(dl_type))
value = None
if download_el and download_el.find_parent(): if download_el and download_el.find_parent():
value = download_el.find_parent().get('href') value = download_el.find_parent().get('href')
if value: else:
self.set('download_url_{}'.format(dl_type), value) a = self.doc.find('a', href=re.compile(r'{}$'.format(dl_type)))
value = a.get('href') if a else None
if value:
self.set('download_url_{}'.format(dl_type), value)
def get_publisher(self): def get_publisher(self):
value = self.get_dt_dd('Publisher') value = self.get_dt_dd('Publisher')
@ -22,8 +28,10 @@ class PressbooksScraper(BaseScraper):
if value: if value:
self.set('publisher', value) self.set('publisher', value)
else: else:
super(PressbooksScraper, self).get_publisher() value = self.check_metas(['citation_publisher', 'publisher', r'DC\.Source'])
if value:
self.set('publisher', value)
def get_title(self): def get_title(self):
value = self.doc.select_one('.entry-title a[title]') value = self.doc.select_one('.entry-title a[title]')
value = value['title'] if value else None value = value['title'] if value else None

View File

@ -8,7 +8,12 @@ from django.conf import settings
from urlparse import urljoin from urlparse import urljoin
from regluit.core import models from regluit.core import models
from regluit.core.validation import authlist_cleaner, identifier_cleaner, validate_date from regluit.core.validation import (
authlist_cleaner,
identifier_cleaner,
valid_subject,
validate_date,
)
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -66,6 +71,8 @@ class BaseScraper(object):
# #
def set(self, name, value): def set(self, name, value):
if isinstance(value,(str, unicode)):
value= value.strip()
self.metadata[name] = value self.metadata[name] = value
def fetch_one_el_content(self, el_name): def fetch_one_el_content(self, el_name):
@ -110,7 +117,7 @@ class BaseScraper(object):
''' get the content of <dd> after a <dt> containing name''' ''' get the content of <dd> after a <dt> containing name'''
dt = self.doc.find('dt', string=re.compile(name)) dt = self.doc.find('dt', string=re.compile(name))
dd = dt.find_next_sibling('dd') if dt else None dd = dt.find_next_sibling('dd') if dt else None
return dd.text if dd else None return dd.text.strip() if dd and dd.text else None
def get_itemprop(self, name, **attrs): def get_itemprop(self, name, **attrs):
value_list = [] value_list = []
@ -244,7 +251,11 @@ class BaseScraper(object):
def get_keywords(self): def get_keywords(self):
value = self.check_metas(['keywords']).strip(',;') value = self.check_metas(['keywords']).strip(',;')
if value: if value:
self.set('subjects', re.split(' *[;,] *', value)) subjects = []
for subject in re.split(' *[;,] *', value):
if valid_subject(subject):
subjects.append(subject)
self.set('subjects', subjects)
def get_publisher(self): def get_publisher(self):
value = self.check_metas(['citation_publisher', r'DC\.Source']) value = self.check_metas(['citation_publisher', r'DC\.Source'])
@ -255,8 +266,8 @@ class BaseScraper(object):
value = self.get_itemprop('datePublished', list_mode='one_item') value = self.get_itemprop('datePublished', list_mode='one_item')
if not value: if not value:
value = self.check_metas([ value = self.check_metas([
'citation_publication_date', r'DC\.Date\.issued', 'datePublished', 'citation_publication_date', 'copyrightYear', r'DC\.Date\.issued', 'datePublished',
'books:release_date', 'book:release_date' 'books:release_date', 'book:release_date',
]) ])
if value: if value:
value = validate_date(value) value = validate_date(value)