Merge remote-tracking branch 'Gluejar/master' into production

pull/91/head
eric 2018-04-27 11:10:09 -04:00
commit 8a9260532e
2 changed files with 36 additions and 20 deletions

View File

@@ -16,10 +16,10 @@ from .smashwords import SmashwordsScraper
def get_scraper(url): def get_scraper(url):
scrapers = [ scrapers = [
PressbooksScraper, PressbooksScraper,
HathitrustScraper,
SpringerScraper, SpringerScraper,
UbiquityScraper, UbiquityScraper,
SmashwordsScraper, SmashwordsScraper,
HathitrustScraper,
BaseScraper, BaseScraper,
] ]
for scraper in scrapers: for scraper in scrapers:

View File

@@ -26,38 +26,54 @@ class HathitrustScraper(BaseScraper):
for record in records: for record in records:
self.record = record self.record = record
return return
self.record = {} self.record = None # probably a hdl not pointing at Hathitrust
self.record = None
def get_downloads(self): def get_downloads(self):
dl_a = self.doc.select_one('#fullPdfLink') if self.record:
value = dl_a['href'] if dl_a else None dl_a = self.doc.select_one('#fullPdfLink')
if value: value = dl_a['href'] if dl_a else None
self.set( if value:
'download_url_{}'.format('pdf'), self.set(
'https://babel.hathitrust.org{}'.format(value) 'download_url_{}'.format('pdf'),
) 'https://babel.hathitrust.org{}'.format(value)
)
return super(HathitrustScraper, self).get_downloads()
def get_isbns(self): def get_isbns(self):
isbn = self.record.get('issn', []) if self.record:
value = identifier_cleaner('isbn', quiet=True)(isbn) isbn = self.record.get('issn', [])
return {'print': value} if value else {} value = identifier_cleaner('isbn', quiet=True)(isbn)
return {'print': value} if value else {}
return super(HathitrustScraper, self).get_isbns()
def get_title(self): def get_title(self):
self.set('title', self.record.get('title', '')) if self.record:
self.set('title', self.record.get('title', ''))
return super(HathitrustScraper, self).get_title()
def get_keywords(self): def get_keywords(self):
self.set('subjects', self.record.get('keywords', [])) if self.record:
self.set('subjects', self.record.get('keywords', []))
return super(HathitrustScraper, self).get_keywords()
def get_publisher(self): def get_publisher(self):
self.set('publisher', self.record.get('publisher', '')) if self.record:
self.set('publisher', self.record.get('publisher', ''))
return super(HathitrustScraper, self).get_publisher()
def get_pubdate(self): def get_pubdate(self):
self.set('publication_date', self.record.get('year', '')) if self.record:
self.set('publication_date', self.record.get('year', ''))
return super(HathitrustScraper, self).get_pubdate()
def get_description(self): def get_description(self):
notes = self.record.get('notes', []) if self.record:
self.set('description', '\r'.join(notes)) notes = self.record.get('notes', [])
self.set('description', '\r'.join(notes))
return super(HathitrustScraper, self).get_description()
def get_genre(self): def get_genre(self):
self.set('genre', self.record.get('type_of_reference', '').lower()) if self.record:
self.set('genre', self.record.get('type_of_reference', '').lower())
return super(HathitrustScraper, self).get_genre()