delint
parent
5ccd7a0a47
commit
d53b3bcc8d
|
@ -16,7 +16,7 @@ CONTAINS_COVER = re.compile('cover')
|
|||
CONTAINS_CC = re.compile('creativecommons.org')
|
||||
CONTAINS_OCLCNUM = re.compile('worldcat.org/oclc/(\d+)')
|
||||
|
||||
class BaseScraper(object):
|
||||
class BaseScraper(object):
|
||||
'''
|
||||
designed to make at least a decent guess for webpages that embed metadata
|
||||
'''
|
||||
|
@ -57,23 +57,23 @@ class BaseScraper(object):
|
|||
#
|
||||
# utilities
|
||||
#
|
||||
|
||||
|
||||
def set(self, name, value):
|
||||
self.metadata[name] = value
|
||||
|
||||
|
||||
def fetch_one_el_content(self, el_name):
|
||||
data_el = self.doc.find(el_name)
|
||||
value = ''
|
||||
if data_el:
|
||||
value = data_el.text
|
||||
return value
|
||||
|
||||
return value
|
||||
|
||||
def check_metas(self, meta_list, **attrs):
|
||||
value = ''
|
||||
list_mode = attrs.pop('list_mode', 'longest')
|
||||
for meta_name in meta_list:
|
||||
attrs['name'] = meta_name
|
||||
|
||||
|
||||
metas = self.doc.find_all('meta', attrs=attrs)
|
||||
if len(metas) == 0:
|
||||
# some sites put schema.org metadata in metas
|
||||
|
@ -88,19 +88,19 @@ class BaseScraper(object):
|
|||
value = el_value
|
||||
elif list_mode == 'list':
|
||||
if value == '':
|
||||
value = [el_value]
|
||||
value = [el_value]
|
||||
else:
|
||||
value.append(el_value)
|
||||
if value:
|
||||
return value
|
||||
return value
|
||||
|
||||
return value
|
||||
|
||||
def get_dt_dd(self, name):
|
||||
''' get the content of <dd> after a <dt> containing name'''
|
||||
dt = self.doc.find('dt', string=re.compile(name))
|
||||
dd = dt.find_next_sibling('dd') if dt else None
|
||||
return dd.text if dd else None
|
||||
|
||||
|
||||
def get_itemprop(self, name, **attrs):
|
||||
value_list = []
|
||||
list_mode = attrs.pop('list_mode', 'list')
|
||||
|
@ -126,14 +126,14 @@ class BaseScraper(object):
|
|||
def get_genre(self):
|
||||
value = self.check_metas(['DC.Type', 'dc.type', 'og:type'])
|
||||
if value and value in ('Text.Book', 'book'):
|
||||
self.set('genre', 'book')
|
||||
self.set('genre', 'book')
|
||||
|
||||
def get_title(self):
|
||||
value = self.check_metas(['DC.Title', 'dc.title', 'citation_title', 'og:title', 'title'])
|
||||
if not value:
|
||||
value = self.fetch_one_el_content('title')
|
||||
self.set('title', value)
|
||||
|
||||
|
||||
def get_language(self):
|
||||
value = self.check_metas(['DC.Language', 'dc.language', 'language', 'inLanguage'])
|
||||
self.set('language', value)
|
||||
|
@ -151,7 +151,7 @@ class BaseScraper(object):
|
|||
'''return a dict of edition keys and ISBNs'''
|
||||
isbns = {}
|
||||
isbn_cleaner = identifier_cleaner('isbn', quiet=True)
|
||||
label_map = {'epub': 'EPUB', 'mobi': 'Mobi',
|
||||
label_map = {'epub': 'EPUB', 'mobi': 'Mobi',
|
||||
'paper': 'Paperback', 'pdf':'PDF', 'hard':'Hardback'}
|
||||
for key in label_map.keys():
|
||||
isbn_key = 'isbn_{}'.format(key)
|
||||
|
@ -180,7 +180,7 @@ class BaseScraper(object):
|
|||
if value:
|
||||
self.identifiers['doi'] = value
|
||||
|
||||
#look for oclc numbers
|
||||
#look for oclc numbers
|
||||
links = self.doc.find_all(href=CONTAINS_OCLCNUM)
|
||||
for link in links:
|
||||
oclcmatch = CONTAINS_OCLCNUM.search(link['href'])
|
||||
|
@ -212,12 +212,12 @@ class BaseScraper(object):
|
|||
})
|
||||
if len(ed_list):
|
||||
self.set('edition_list', ed_list)
|
||||
|
||||
|
||||
def get_keywords(self):
|
||||
value = self.check_metas(['keywords']).strip(',;')
|
||||
if value:
|
||||
self.set('subjects', re.split(' *[;,] *', value))
|
||||
|
||||
|
||||
def get_publisher(self):
|
||||
value = self.check_metas(['citation_publisher', 'DC.Source'])
|
||||
if value:
|
||||
|
@ -272,14 +272,14 @@ class BaseScraper(object):
|
|||
if not image_url.startswith('http'):
|
||||
image_url = urljoin(self.base, image_url)
|
||||
self.set('covers', [{'image_url': image_url}])
|
||||
|
||||
|
||||
def get_downloads(self):
|
||||
for dl_type in ['epub', 'mobi', 'pdf']:
|
||||
dl_meta = 'citation_{}_url'.format(dl_type)
|
||||
value = self.check_metas([dl_meta])
|
||||
if value:
|
||||
self.set('download_url_{}'.format(dl_type), value)
|
||||
|
||||
|
||||
def get_license(self):
|
||||
'''only looks for cc licenses'''
|
||||
links = self.doc.find_all(href=CONTAINS_CC)
|
||||
|
@ -290,13 +290,13 @@ class BaseScraper(object):
|
|||
def can_scrape(cls, url):
|
||||
''' return True if the class can scrape the URL '''
|
||||
return True
|
||||
|
||||
|
||||
class PressbooksScraper(BaseScraper):
|
||||
def get_downloads(self):
|
||||
for dl_type in ['epub', 'mobi', 'pdf']:
|
||||
download_el = self.doc.select_one('.{}'.format(dl_type))
|
||||
if download_el and download_el.find_parent():
|
||||
value = download_el.find_parent().get('href')
|
||||
value = download_el.find_parent().get('href')
|
||||
if value:
|
||||
self.set('download_url_{}'.format(dl_type), value)
|
||||
|
||||
|
@ -309,7 +309,7 @@ class PressbooksScraper(BaseScraper):
|
|||
self.set('publisher', value)
|
||||
else:
|
||||
super(PressbooksScraper, self).get_publisher()
|
||||
|
||||
|
||||
def get_title(self):
|
||||
value = self.doc.select_one('.entry-title a[title]')
|
||||
value = value['title'] if value else None
|
||||
|
@ -327,10 +327,10 @@ class PressbooksScraper(BaseScraper):
|
|||
self.identifiers['isbn_{}'.format(key)] = isbn
|
||||
isbns[key] = isbn
|
||||
return isbns
|
||||
|
||||
|
||||
@classmethod
|
||||
def can_scrape(cls, url):
|
||||
pb_sites = ['bookkernel.com','milnepublishing.geneseo.edu', 'pressbooks',
|
||||
pb_sites = ['bookkernel.com','milnepublishing.geneseo.edu', 'pressbooks',
|
||||
'press.rebus.community','pb.unizin.org']
|
||||
''' return True if the class can scrape the URL '''
|
||||
for site in pb_sites:
|
||||
|
@ -340,7 +340,7 @@ class PressbooksScraper(BaseScraper):
|
|||
|
||||
|
||||
class HathitrustScraper(BaseScraper):
|
||||
|
||||
|
||||
CATALOG = re.compile(r'catalog.hathitrust.org/Record/(\d+)')
|
||||
|
||||
def setup(self):
|
||||
|
@ -354,28 +354,28 @@ class HathitrustScraper(BaseScraper):
|
|||
self.record = record
|
||||
return
|
||||
self.record = {}
|
||||
|
||||
|
||||
|
||||
def get_downloads(self):
|
||||
dl_a = self.doc.select_one('#fullPdfLink')
|
||||
value = dl_a['href'] if dl_a else None
|
||||
if value:
|
||||
self.set(
|
||||
'download_url_{}'.format('pdf'),
|
||||
'download_url_{}'.format('pdf'),
|
||||
'https://babel.hathitrust.org{}'.format(value)
|
||||
)
|
||||
|
||||
|
||||
def get_isbns(self):
|
||||
isbn = self.record.get('issn', [])
|
||||
value = identifier_cleaner('isbn', quiet=True)(isbn)
|
||||
return {'print': value} if value else {}
|
||||
|
||||
|
||||
def get_title(self):
|
||||
self.set('title', self.record.get('title', ''))
|
||||
|
||||
def get_keywords(self):
|
||||
self.set('subjects', self.record.get('keywords', []))
|
||||
|
||||
|
||||
def get_publisher(self):
|
||||
self.set('publisher', self.record.get('publisher', ''))
|
||||
|
||||
|
@ -385,7 +385,7 @@ class HathitrustScraper(BaseScraper):
|
|||
def get_description(self):
|
||||
notes = self.record.get('notes', [])
|
||||
self.set('description', '\r'.join(notes))
|
||||
|
||||
|
||||
def get_genre(self):
|
||||
self.set('genre', self.record.get('type_of_reference', '').lower())
|
||||
|
||||
|
|
Loading…
Reference in New Issue