From 7efb048a5cdb0910f6062be782bb1d8b3f9c8af0 Mon Sep 17 00:00:00 2001 From: eric Date: Thu, 2 Aug 2018 17:27:04 -0400 Subject: [PATCH] adapt to newer pressbooks version --- core/loaders/pressbooks.py | 20 ++++++++++++++------ core/loaders/scrape.py | 21 ++++++++++++++++----- 2 files changed, 30 insertions(+), 11 deletions(-) diff --git a/core/loaders/pressbooks.py b/core/loaders/pressbooks.py index 47291e89..8f7ec7b4 100644 --- a/core/loaders/pressbooks.py +++ b/core/loaders/pressbooks.py @@ -1,18 +1,24 @@ +import re from regluit.core.validation import identifier_cleaner from . import BaseScraper class PressbooksScraper(BaseScraper): - can_scrape_hosts = ['bookkernel.com', 'milnepublishing.geneseo.edu', - 'press.rebus.community', 'pb.unizin.org'] + can_scrape_hosts = [ + 'bookkernel.com', 'milnepublishing.geneseo.edu', 'press.rebus.community', 'pb.unizin.org', + 'opentext.wsu.edu', 'oer.missouriwestern.edu', 'eskript.ethz.ch', 'opentext.lib.vt.edu',] can_scrape_strings = ['pressbooks'] def get_downloads(self): for dl_type in ['epub', 'mobi', 'pdf']: download_el = self.doc.select_one('.{}'.format(dl_type)) + value = None if download_el and download_el.find_parent(): value = download_el.find_parent().get('href') - if value: - self.set('download_url_{}'.format(dl_type), value) + else: + a = self.doc.find('a', href=re.compile(r'{}$'.format(dl_type))) + value = a.get('href') if a else None + if value: + self.set('download_url_{}'.format(dl_type), value) def get_publisher(self): value = self.get_dt_dd('Publisher') @@ -22,8 +28,10 @@ class PressbooksScraper(BaseScraper): if value: self.set('publisher', value) else: - super(PressbooksScraper, self).get_publisher() - + value = self.check_metas(['citation_publisher', 'publisher', r'DC\.Source']) + if value: + self.set('publisher', value) + def get_title(self): value = self.doc.select_one('.entry-title a[title]') value = value['title'] if value else None diff --git a/core/loaders/scrape.py b/core/loaders/scrape.py index 
521748fd..a2cdd439 100644 --- a/core/loaders/scrape.py +++ b/core/loaders/scrape.py @@ -8,7 +8,12 @@ from django.conf import settings from urlparse import urljoin from regluit.core import models -from regluit.core.validation import authlist_cleaner, identifier_cleaner, validate_date +from regluit.core.validation import ( + authlist_cleaner, + identifier_cleaner, + valid_subject, + validate_date, +) logger = logging.getLogger(__name__) @@ -66,6 +71,8 @@ class BaseScraper(object): # def set(self, name, value): + if isinstance(value,(str, unicode)): + value= value.strip() self.metadata[name] = value def fetch_one_el_content(self, el_name): @@ -110,7 +117,7 @@ class BaseScraper(object): ''' get the content of
<dd> after a <dt>
containing name''' dt = self.doc.find('dt', string=re.compile(name)) dd = dt.find_next_sibling('dd') if dt else None - return dd.text if dd else None + return dd.text.strip() if dd and dd.text else None def get_itemprop(self, name, **attrs): value_list = [] @@ -244,7 +251,11 @@ class BaseScraper(object): def get_keywords(self): value = self.check_metas(['keywords']).strip(',;') if value: - self.set('subjects', re.split(' *[;,] *', value)) + subjects = [] + for subject in re.split(' *[;,] *', value): + if valid_subject(subject): + subjects.append(subject) + self.set('subjects', subjects) def get_publisher(self): value = self.check_metas(['citation_publisher', r'DC\.Source']) @@ -255,8 +266,8 @@ class BaseScraper(object): value = self.get_itemprop('datePublished', list_mode='one_item') if not value: value = self.check_metas([ - 'citation_publication_date', r'DC\.Date\.issued', 'datePublished', - 'books:release_date', 'book:release_date' + 'citation_publication_date', 'copyrightYear', r'DC\.Date\.issued', 'datePublished', + 'books:release_date', 'book:release_date', ]) if value: value = validate_date(value)