pull/46/head
eric 2017-12-07 17:36:08 -05:00
parent 5ccd7a0a47
commit d53b3bcc8d
1 changed files with 30 additions and 30 deletions

View File

@ -16,7 +16,7 @@ CONTAINS_COVER = re.compile('cover')
# Matches hrefs that point at a Creative Commons license page.
# The dot is escaped so it only matches a literal '.'.
CONTAINS_CC = re.compile(r'creativecommons\.org')
# Captures the numeric OCLC id from a worldcat.org link.
# Raw string: '\d' in a plain string literal is an invalid escape sequence.
CONTAINS_OCLCNUM = re.compile(r'worldcat\.org/oclc/(\d+)')
class BaseScraper(object):
class BaseScraper(object):
'''
designed to make at least a decent guess for webpages that embed metadata
'''
@ -57,23 +57,23 @@ class BaseScraper(object):
#
# utilities
#
def set(self, name, value):
    '''store value in the scraped metadata under the key name'''
    metadata = self.metadata
    metadata[name] = value
def fetch_one_el_content(self, el_name):
    '''return the text of the first el_name element found in the document,
    or an empty string when there is no such element'''
    found = self.doc.find(el_name)
    return found.text if found else ''
def check_metas(self, meta_list, **attrs):
value = ''
list_mode = attrs.pop('list_mode', 'longest')
for meta_name in meta_list:
attrs['name'] = meta_name
metas = self.doc.find_all('meta', attrs=attrs)
if len(metas) == 0:
# some sites put schema.org metadata in metas
@ -88,19 +88,19 @@ class BaseScraper(object):
value = el_value
elif list_mode == 'list':
if value == '':
value = [el_value]
value = [el_value]
else:
value.append(el_value)
if value:
return value
return value
return value
def get_dt_dd(self, name):
    ''' return the text of the <dd> that follows the first <dt> whose string
    matches name (used as a regular expression); None if either is missing'''
    term = self.doc.find('dt', string=re.compile(name))
    if not term:
        return None
    definition = term.find_next_sibling('dd')
    if not definition:
        return None
    return definition.text
def get_itemprop(self, name, **attrs):
value_list = []
list_mode = attrs.pop('list_mode', 'list')
@ -126,14 +126,14 @@ class BaseScraper(object):
def get_genre(self):
    '''set genre to 'book' when the type metas identify the page as a book'''
    declared_type = self.check_metas(['DC.Type', 'dc.type', 'og:type'])
    if not declared_type or declared_type not in ('Text.Book', 'book'):
        return
    self.set('genre', 'book')
def get_title(self):
    '''set title from the title metas, falling back to the content of the
    <title> element when no meta supplies one'''
    title = (
        self.check_metas(['DC.Title', 'dc.title', 'citation_title', 'og:title', 'title'])
        or self.fetch_one_el_content('title')
    )
    self.set('title', title)
def get_language(self):
value = self.check_metas(['DC.Language', 'dc.language', 'language', 'inLanguage'])
self.set('language', value)
@ -151,7 +151,7 @@ class BaseScraper(object):
'''return a dict of edition keys and ISBNs'''
isbns = {}
isbn_cleaner = identifier_cleaner('isbn', quiet=True)
label_map = {'epub': 'EPUB', 'mobi': 'Mobi',
label_map = {'epub': 'EPUB', 'mobi': 'Mobi',
'paper': 'Paperback', 'pdf':'PDF', 'hard':'Hardback'}
for key in label_map.keys():
isbn_key = 'isbn_{}'.format(key)
@ -180,7 +180,7 @@ class BaseScraper(object):
if value:
self.identifiers['doi'] = value
#look for oclc numbers
#look for oclc numbers
links = self.doc.find_all(href=CONTAINS_OCLCNUM)
for link in links:
oclcmatch = CONTAINS_OCLCNUM.search(link['href'])
@ -212,12 +212,12 @@ class BaseScraper(object):
})
if len(ed_list):
self.set('edition_list', ed_list)
def get_keywords(self):
    '''set subjects from the keywords meta.

    The meta value is split on commas and semicolons. Each keyword is
    stripped of surrounding whitespace and empty pieces (from leading,
    trailing or doubled delimiters) are dropped. Nothing is set when no
    usable keyword remains.
    '''
    value = self.check_metas(['keywords'])
    if not value:
        return
    # trim each piece so ' a , b ' does not leak whitespace into subjects
    pieces = [piece.strip() for piece in re.split(r'[;,]', value)]
    subjects = [piece for piece in pieces if piece]
    if subjects:
        self.set('subjects', subjects)
def get_publisher(self):
value = self.check_metas(['citation_publisher', 'DC.Source'])
if value:
@ -272,14 +272,14 @@ class BaseScraper(object):
if not image_url.startswith('http'):
image_url = urljoin(self.base, image_url)
self.set('covers', [{'image_url': image_url}])
def get_downloads(self):
    '''set download_url_<type> for each of epub, mobi and pdf that has a
    non-empty citation_<type>_url meta'''
    for file_format in ('epub', 'mobi', 'pdf'):
        url = self.check_metas(['citation_{}_url'.format(file_format)])
        if not url:
            continue
        self.set('download_url_{}'.format(file_format), url)
def get_license(self):
'''only looks for cc licenses'''
links = self.doc.find_all(href=CONTAINS_CC)
@ -290,13 +290,13 @@ class BaseScraper(object):
def can_scrape(cls, url):
    ''' tell whether this class is able to scrape url; always True here '''
    return True
class PressbooksScraper(BaseScraper):
def get_downloads(self):
for dl_type in ['epub', 'mobi', 'pdf']:
download_el = self.doc.select_one('.{}'.format(dl_type))
if download_el and download_el.find_parent():
value = download_el.find_parent().get('href')
value = download_el.find_parent().get('href')
if value:
self.set('download_url_{}'.format(dl_type), value)
@ -309,7 +309,7 @@ class PressbooksScraper(BaseScraper):
self.set('publisher', value)
else:
super(PressbooksScraper, self).get_publisher()
def get_title(self):
value = self.doc.select_one('.entry-title a[title]')
value = value['title'] if value else None
@ -327,10 +327,10 @@ class PressbooksScraper(BaseScraper):
self.identifiers['isbn_{}'.format(key)] = isbn
isbns[key] = isbn
return isbns
@classmethod
def can_scrape(cls, url):
pb_sites = ['bookkernel.com','milnepublishing.geneseo.edu', 'pressbooks',
pb_sites = ['bookkernel.com','milnepublishing.geneseo.edu', 'pressbooks',
'press.rebus.community','pb.unizin.org']
''' return True if the class can scrape the URL '''
for site in pb_sites:
@ -340,7 +340,7 @@ class PressbooksScraper(BaseScraper):
class HathitrustScraper(BaseScraper):
CATALOG = re.compile(r'catalog.hathitrust.org/Record/(\d+)')
def setup(self):
@ -354,28 +354,28 @@ class HathitrustScraper(BaseScraper):
self.record = record
return
self.record = {}
def get_downloads(self):
    '''set download_url_pdf from the href of the #fullPdfLink element,
    prefixed with the babel.hathitrust.org host; does nothing when the
    element is missing or its href is empty'''
    pdf_link = self.doc.select_one('#fullPdfLink')
    if not pdf_link:
        return
    href = pdf_link['href']
    if not href:
        return
    key = 'download_url_{}'.format('pdf')
    self.set(key, 'https://babel.hathitrust.org{}'.format(href))
def get_isbns(self):
    '''return {'print': isbn} built from the record's 'issn' entry after it
    has been run through the isbn cleaner, or {} when nothing valid remains'''
    raw = self.record.get('issn', [])
    clean = identifier_cleaner('isbn', quiet=True)
    cleaned = clean(raw)
    if cleaned:
        return {'print': cleaned}
    return {}
def get_title(self):
    '''set title from the record, using an empty string when it has none'''
    title = self.record.get('title', '')
    self.set('title', title)
def get_keywords(self):
    '''set subjects from the record's keywords, using an empty list when it
    has none'''
    keywords = self.record.get('keywords', [])
    self.set('subjects', keywords)
def get_publisher(self):
self.set('publisher', self.record.get('publisher', ''))
@ -385,7 +385,7 @@ class HathitrustScraper(BaseScraper):
def get_description(self):
    '''set description to the record's notes joined with carriage returns;
    an empty string when the record has no notes'''
    description = '\r'.join(self.record.get('notes', []))
    self.set('description', description)
def get_genre(self):
self.set('genre', self.record.get('type_of_reference', '').lower())