adapt to newer pressbooks version
parent
a355a873f7
commit
7efb048a5c
|
@ -1,18 +1,24 @@
|
||||||
|
import re
|
||||||
from regluit.core.validation import identifier_cleaner
|
from regluit.core.validation import identifier_cleaner
|
||||||
from . import BaseScraper
|
from . import BaseScraper
|
||||||
|
|
||||||
class PressbooksScraper(BaseScraper):
|
class PressbooksScraper(BaseScraper):
|
||||||
can_scrape_hosts = ['bookkernel.com', 'milnepublishing.geneseo.edu',
|
can_scrape_hosts = [
|
||||||
'press.rebus.community', 'pb.unizin.org']
|
'bookkernel.com', 'milnepublishing.geneseo.edu', 'press.rebus.community', 'pb.unizin.org',
|
||||||
|
'opentext.wsu.edu', 'oer.missouriwestern.edu', 'eskript.ethz.ch', 'opentext.lib.vt.edu',]
|
||||||
can_scrape_strings = ['pressbooks']
|
can_scrape_strings = ['pressbooks']
|
||||||
|
|
||||||
def get_downloads(self):
|
def get_downloads(self):
|
||||||
for dl_type in ['epub', 'mobi', 'pdf']:
|
for dl_type in ['epub', 'mobi', 'pdf']:
|
||||||
download_el = self.doc.select_one('.{}'.format(dl_type))
|
download_el = self.doc.select_one('.{}'.format(dl_type))
|
||||||
|
value = None
|
||||||
if download_el and download_el.find_parent():
|
if download_el and download_el.find_parent():
|
||||||
value = download_el.find_parent().get('href')
|
value = download_el.find_parent().get('href')
|
||||||
if value:
|
else:
|
||||||
self.set('download_url_{}'.format(dl_type), value)
|
a = self.doc.find('a', href=re.compile(r'{}$'.format(dl_type)))
|
||||||
|
value = a.get('href') if a else None
|
||||||
|
if value:
|
||||||
|
self.set('download_url_{}'.format(dl_type), value)
|
||||||
|
|
||||||
def get_publisher(self):
|
def get_publisher(self):
|
||||||
value = self.get_dt_dd('Publisher')
|
value = self.get_dt_dd('Publisher')
|
||||||
|
@ -22,8 +28,10 @@ class PressbooksScraper(BaseScraper):
|
||||||
if value:
|
if value:
|
||||||
self.set('publisher', value)
|
self.set('publisher', value)
|
||||||
else:
|
else:
|
||||||
super(PressbooksScraper, self).get_publisher()
|
value = self.check_metas(['citation_publisher', 'publisher', r'DC\.Source'])
|
||||||
|
if value:
|
||||||
|
self.set('publisher', value)
|
||||||
|
|
||||||
def get_title(self):
|
def get_title(self):
|
||||||
value = self.doc.select_one('.entry-title a[title]')
|
value = self.doc.select_one('.entry-title a[title]')
|
||||||
value = value['title'] if value else None
|
value = value['title'] if value else None
|
||||||
|
|
|
@ -8,7 +8,12 @@ from django.conf import settings
|
||||||
from urlparse import urljoin
|
from urlparse import urljoin
|
||||||
|
|
||||||
from regluit.core import models
|
from regluit.core import models
|
||||||
from regluit.core.validation import authlist_cleaner, identifier_cleaner, validate_date
|
from regluit.core.validation import (
|
||||||
|
authlist_cleaner,
|
||||||
|
identifier_cleaner,
|
||||||
|
valid_subject,
|
||||||
|
validate_date,
|
||||||
|
)
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
@ -66,6 +71,8 @@ class BaseScraper(object):
|
||||||
#
|
#
|
||||||
|
|
||||||
def set(self, name, value):
|
def set(self, name, value):
|
||||||
|
if isinstance(value,(str, unicode)):
|
||||||
|
value= value.strip()
|
||||||
self.metadata[name] = value
|
self.metadata[name] = value
|
||||||
|
|
||||||
def fetch_one_el_content(self, el_name):
|
def fetch_one_el_content(self, el_name):
|
||||||
|
@ -110,7 +117,7 @@ class BaseScraper(object):
|
||||||
''' get the content of <dd> after a <dt> containing name'''
|
''' get the content of <dd> after a <dt> containing name'''
|
||||||
dt = self.doc.find('dt', string=re.compile(name))
|
dt = self.doc.find('dt', string=re.compile(name))
|
||||||
dd = dt.find_next_sibling('dd') if dt else None
|
dd = dt.find_next_sibling('dd') if dt else None
|
||||||
return dd.text if dd else None
|
return dd.text.strip() if dd and dd.text else None
|
||||||
|
|
||||||
def get_itemprop(self, name, **attrs):
|
def get_itemprop(self, name, **attrs):
|
||||||
value_list = []
|
value_list = []
|
||||||
|
@ -244,7 +251,11 @@ class BaseScraper(object):
|
||||||
def get_keywords(self):
|
def get_keywords(self):
|
||||||
value = self.check_metas(['keywords']).strip(',;')
|
value = self.check_metas(['keywords']).strip(',;')
|
||||||
if value:
|
if value:
|
||||||
self.set('subjects', re.split(' *[;,] *', value))
|
subjects = []
|
||||||
|
for subject in re.split(' *[;,] *', value):
|
||||||
|
if valid_subject(subject):
|
||||||
|
subjects.append(subject)
|
||||||
|
self.set('subjects', subjects)
|
||||||
|
|
||||||
def get_publisher(self):
|
def get_publisher(self):
|
||||||
value = self.check_metas(['citation_publisher', r'DC\.Source'])
|
value = self.check_metas(['citation_publisher', r'DC\.Source'])
|
||||||
|
@ -255,8 +266,8 @@ class BaseScraper(object):
|
||||||
value = self.get_itemprop('datePublished', list_mode='one_item')
|
value = self.get_itemprop('datePublished', list_mode='one_item')
|
||||||
if not value:
|
if not value:
|
||||||
value = self.check_metas([
|
value = self.check_metas([
|
||||||
'citation_publication_date', r'DC\.Date\.issued', 'datePublished',
|
'citation_publication_date', 'copyrightYear', r'DC\.Date\.issued', 'datePublished',
|
||||||
'books:release_date', 'book:release_date'
|
'books:release_date', 'book:release_date',
|
||||||
])
|
])
|
||||||
if value:
|
if value:
|
||||||
value = validate_date(value)
|
value = validate_date(value)
|
||||||
|
|
Loading…
Reference in New Issue