Merge remote-tracking branch 'Gluejar/master' into production

pull/91/head
eric 2018-04-27 11:10:09 -04:00
commit 8a9260532e
2 changed files with 36 additions and 20 deletions

View File

@@ -16,10 +16,10 @@ from .smashwords import SmashwordsScraper
def get_scraper(url): def get_scraper(url):
scrapers = [ scrapers = [
PressbooksScraper, PressbooksScraper,
HathitrustScraper,
SpringerScraper, SpringerScraper,
UbiquityScraper, UbiquityScraper,
SmashwordsScraper, SmashwordsScraper,
HathitrustScraper,
BaseScraper, BaseScraper,
] ]
for scraper in scrapers: for scraper in scrapers:

View File

@@ -26,10 +26,11 @@ class HathitrustScraper(BaseScraper):
for record in records: for record in records:
self.record = record self.record = record
return return
self.record = {} self.record = None # probably a hdl not pointing at Hathitrust
self.record = None
def get_downloads(self): def get_downloads(self):
if self.record:
dl_a = self.doc.select_one('#fullPdfLink') dl_a = self.doc.select_one('#fullPdfLink')
value = dl_a['href'] if dl_a else None value = dl_a['href'] if dl_a else None
if value: if value:
@@ -37,27 +38,42 @@ class HathitrustScraper(BaseScraper):
'download_url_{}'.format('pdf'), 'download_url_{}'.format('pdf'),
'https://babel.hathitrust.org{}'.format(value) 'https://babel.hathitrust.org{}'.format(value)
) )
return super(HathitrustScraper, self).get_downloads()
def get_isbns(self): def get_isbns(self):
if self.record:
isbn = self.record.get('issn', []) isbn = self.record.get('issn', [])
value = identifier_cleaner('isbn', quiet=True)(isbn) value = identifier_cleaner('isbn', quiet=True)(isbn)
return {'print': value} if value else {} return {'print': value} if value else {}
return super(HathitrustScraper, self).get_isbns()
def get_title(self): def get_title(self):
if self.record:
self.set('title', self.record.get('title', '')) self.set('title', self.record.get('title', ''))
return super(HathitrustScraper, self).get_title()
def get_keywords(self): def get_keywords(self):
if self.record:
self.set('subjects', self.record.get('keywords', [])) self.set('subjects', self.record.get('keywords', []))
return super(HathitrustScraper, self).get_keywords()
def get_publisher(self): def get_publisher(self):
if self.record:
self.set('publisher', self.record.get('publisher', '')) self.set('publisher', self.record.get('publisher', ''))
return super(HathitrustScraper, self).get_publisher()
def get_pubdate(self): def get_pubdate(self):
if self.record:
self.set('publication_date', self.record.get('year', '')) self.set('publication_date', self.record.get('year', ''))
return super(HathitrustScraper, self).get_pubdate()
def get_description(self): def get_description(self):
if self.record:
notes = self.record.get('notes', []) notes = self.record.get('notes', [])
self.set('description', '\r'.join(notes)) self.set('description', '\r'.join(notes))
return super(HathitrustScraper, self).get_description()
def get_genre(self): def get_genre(self):
if self.record:
self.set('genre', self.record.get('type_of_reference', '').lower()) self.set('genre', self.record.get('type_of_reference', '').lower())
return super(HathitrustScraper, self).get_genre()