From 59388933a9388f04a50eb531eee0ffce0ac9d2f1 Mon Sep 17 00:00:00 2001 From: eric Date: Wed, 3 Jan 2018 13:58:45 -0500 Subject: [PATCH] one scraper per file --- core/loaders/__init__.py | 4 +- core/loaders/hathitrust.py | 63 ++++++++++++++++++++++++++ core/loaders/pressbooks.py | 43 ++++++++++++++++++ core/loaders/scrape.py | 92 -------------------------------------- 4 files changed, 109 insertions(+), 93 deletions(-) create mode 100644 core/loaders/hathitrust.py create mode 100644 core/loaders/pressbooks.py diff --git a/core/loaders/__init__.py b/core/loaders/__init__.py index 57eddf25..1759f600 100755 --- a/core/loaders/__init__.py +++ b/core/loaders/__init__.py @@ -6,7 +6,9 @@ from django.conf import settings from gitenberg.metadata.pandata import Pandata from regluit.core.bookloader import add_from_bookdatas, BasePandataLoader -from .scrape import PressbooksScraper, HathitrustScraper, BaseScraper +from .scrape import BaseScraper +from .hathitrust import HathitrustScraper +from .pressbooks import PressbooksScraper from .springer import SpringerScraper from .ubiquity import UbiquityScraper diff --git a/core/loaders/hathitrust.py b/core/loaders/hathitrust.py new file mode 100644 index 00000000..6b76f851 --- /dev/null +++ b/core/loaders/hathitrust.py @@ -0,0 +1,63 @@ +import re + +import requests +from RISparser import read as readris + +from django.conf import settings + +from regluit.core.validation import identifier_cleaner + +from .scrape import BaseScraper + + +class HathitrustScraper(BaseScraper): + + can_scrape_hosts = ['hathitrust.org'] + can_scrape_strings = ['hdl.handle.net/2027/'] + CATALOG = re.compile(r'catalog.hathitrust.org/Record/(\d+)') + + def setup(self): + catalog_a = self.doc.find('a', href=self.CATALOG) + if catalog_a: + catalog_num = self.CATALOG.search(catalog_a['href']).group(1) + ris_url = 'https://catalog.hathitrust.org/Search/SearchExport?handpicked={}&method=ris'.format(catalog_num) + response = requests.get(ris_url, headers={"User-Agent": settings.USER_AGENT}) + records = readris(response.text.splitlines()) if response.status_code == 200 else [] + for record in records: + self.record = record + return + self.record = {} + + + def get_downloads(self): + dl_a = self.doc.select_one('#fullPdfLink') + value = dl_a['href'] if dl_a else None + if value: + self.set( + 'download_url_{}'.format('pdf'), + 'https://babel.hathitrust.org{}'.format(value) + ) + + def get_isbns(self): + isbn = self.record.get('issn', []) + value = identifier_cleaner('isbn', quiet=True)(isbn) + return {'print': value} if value else {} + + def get_title(self): + self.set('title', self.record.get('title', '')) + + def get_keywords(self): + self.set('subjects', self.record.get('keywords', [])) + + def get_publisher(self): + self.set('publisher', self.record.get('publisher', '')) + + def get_pubdate(self): + self.set('publication_date', self.record.get('year', '')) + + def get_description(self): + notes = self.record.get('notes', []) + self.set('description', '\r'.join(notes)) + + def get_genre(self): + self.set('genre', self.record.get('type_of_reference', '').lower()) diff --git a/core/loaders/pressbooks.py b/core/loaders/pressbooks.py new file mode 100644 index 00000000..47291e89 --- /dev/null +++ b/core/loaders/pressbooks.py @@ -0,0 +1,43 @@ +from regluit.core.validation import identifier_cleaner +from . import BaseScraper + +class PressbooksScraper(BaseScraper): + can_scrape_hosts = ['bookkernel.com', 'milnepublishing.geneseo.edu', + 'press.rebus.community', 'pb.unizin.org'] + can_scrape_strings = ['pressbooks'] + + def get_downloads(self): + for dl_type in ['epub', 'mobi', 'pdf']: + download_el = self.doc.select_one('.{}'.format(dl_type)) + if download_el and download_el.find_parent(): + value = download_el.find_parent().get('href') + if value: + self.set('download_url_{}'.format(dl_type), value) + + def get_publisher(self): + value = self.get_dt_dd('Publisher') + if not value: + value = self.doc.select_one('.cie-name') + value = value.text if value else None + if value: + self.set('publisher', value) + else: + super(PressbooksScraper, self).get_publisher() + + def get_title(self): + value = self.doc.select_one('.entry-title a[title]') + value = value['title'] if value else None + if value: + self.set('title', value) + else: + super(PressbooksScraper, self).get_title() + + def get_isbns(self): + '''add isbn identifiers and return a dict of edition keys and ISBNs''' + isbns = {} + for (key, label) in [('electronic', 'Ebook ISBN'), ('paper', 'Print ISBN')]: + isbn = identifier_cleaner('isbn', quiet=True)(self.get_dt_dd(label)) + if isbn: + self.identifiers['isbn_{}'.format(key)] = isbn + isbns[key] = isbn + return isbns diff --git a/core/loaders/scrape.py b/core/loaders/scrape.py index bde8f959..5dafa685 100644 --- a/core/loaders/scrape.py +++ b/core/loaders/scrape.py @@ -6,7 +6,6 @@ from bs4 import BeautifulSoup #from gitenberg.metadata.pandata import Pandata from django.conf import settings from urlparse import urljoin -from RISparser import read as readris from regluit.core import models from regluit.core.validation import authlist_cleaner, identifier_cleaner, validate_date @@ -318,97 +317,6 @@ class BaseScraper(object): self.set('rights_url', link['href']) -class PressbooksScraper(BaseScraper): - can_scrape_hosts = ['bookkernel.com', 'milnepublishing.geneseo.edu', - 'press.rebus.community', 'pb.unizin.org'] - can_scrape_strings = ['pressbooks'] - - def get_downloads(self): - for dl_type in ['epub', 'mobi', 'pdf']: - download_el = self.doc.select_one('.{}'.format(dl_type)) - if download_el and download_el.find_parent(): - value = download_el.find_parent().get('href') - if value: - self.set('download_url_{}'.format(dl_type), value) - - def get_publisher(self): - value = self.get_dt_dd('Publisher') - if not value: - value = self.doc.select_one('.cie-name') - value = value.text if value else None - if value: - self.set('publisher', value) - else: - super(PressbooksScraper, self).get_publisher() - - def get_title(self): - value = self.doc.select_one('.entry-title a[title]') - value = value['title'] if value else None - if value: - self.set('title', value) - else: - super(PressbooksScraper, self).get_title() - - def get_isbns(self): - '''add isbn identifiers and return a dict of edition keys and ISBNs''' - isbns = {} - for (key, label) in [('electronic', 'Ebook ISBN'), ('paper', 'Print ISBN')]: - isbn = identifier_cleaner('isbn', quiet=True)(self.get_dt_dd(label)) - if isbn: - self.identifiers['isbn_{}'.format(key)] = isbn - isbns[key] = isbn - return isbns -class HathitrustScraper(BaseScraper): - - can_scrape_hosts = ['hathitrust.org'] - can_scrape_strings = ['hdl.handle.net/2027/'] - CATALOG = re.compile(r'catalog.hathitrust.org/Record/(\d+)') - - def setup(self): - catalog_a = self.doc.find('a', href=self.CATALOG) - if catalog_a: - catalog_num = self.CATALOG.search(catalog_a['href']).group(1) - ris_url = 'https://catalog.hathitrust.org/Search/SearchExport?handpicked={}&method=ris'.format(catalog_num) - response = requests.get(ris_url, headers={"User-Agent": settings.USER_AGENT}) - records = readris(response.text.splitlines()) if response.status_code == 200 else [] - for record in records: - self.record = record - return - self.record = {} - - - def get_downloads(self): - dl_a = self.doc.select_one('#fullPdfLink') - value = dl_a['href'] if dl_a else None - if value: - self.set( - 'download_url_{}'.format('pdf'), - 'https://babel.hathitrust.org{}'.format(value) - ) - - def get_isbns(self): - isbn = self.record.get('issn', []) - value = identifier_cleaner('isbn', quiet=True)(isbn) - return {'print': value} if value else {} - - def get_title(self): - self.set('title', self.record.get('title', '')) - - def get_keywords(self): - self.set('subjects', self.record.get('keywords', [])) - - def get_publisher(self): - self.set('publisher', self.record.get('publisher', '')) - - def get_pubdate(self): - self.set('publication_date', self.record.get('year', '')) - - def get_description(self): - notes = self.record.get('notes', []) - self.set('description', '\r'.join(notes)) - - def get_genre(self): - self.set('genre', self.record.get('type_of_reference', '').lower())