add springer scraper

pull/46/head
eric 2017-12-06 18:13:46 -05:00
parent 5f39729d74
commit 82784778c4
6 changed files with 191 additions and 47 deletions

View File: core/bookloader.py

@@ -38,7 +38,6 @@ from . import cc
from . import models
from .parameters import WORK_IDENTIFIERS
from .validation import identifier_cleaner, unreverse_name
from .loaders.scrape import get_scraper, scrape_sitemap
logger = logging.getLogger(__name__)
request_log = logging.getLogger("requests")
@@ -755,7 +754,7 @@ def edition_for_ident(id_type, id_value):
    #print 'returning edition for {}: {}'.format(id_type, id_value)
    for ident in models.Identifier.objects.filter(type=id_type, value=id_value):
        return ident.edition if ident.edition else ident.work.editions[0]

def edition_for_etype(etype, metadata, default=None):
    '''
    assumes the metadata contains the isbn_etype attributes, and that the editions have been created.
@@ -774,7 +773,7 @@ def edition_for_etype(etype, metadata, default=None):
            return edition_for_ident(key, metadata.identifiers[key])
    for key in metadata.edition_identifiers.keys():
        return edition_for_ident(key, metadata.identifiers[key])

MATCH_LICENSE = re.compile(r'creativecommons.org/licenses/([^/]+)/')

def load_ebookfile(url, etype):
@@ -793,14 +792,14 @@ def load_ebookfile(url, etype):
        logger.error(u'could not open {}'.format(url))
    except ValidationError, e:
        logger.error(u'downloaded {} was not a valid {}'.format(url, etype))

class BasePandataLoader(object):
    def __init__(self, url):
        self.base_url = url

    def load_from_pandata(self, metadata, work=None):
        ''' metadata is a Pandata object'''
        #find a work to associate
        edition = None
        has_ed_id = False
@@ -862,7 +861,7 @@ class BasePandataLoader(object):
            if metadata.description and len(metadata.description) > len(work.description):
                #be careful about overwriting the work description
                work.description = metadata.description
            if metadata.creator and not edition.authors.count():
                edition.authors.clear()
                for key in metadata.creator.keys():
                    creators = metadata.creator[key]
@@ -901,7 +900,7 @@ class BasePandataLoader(object):
            contentfile = load_ebookfile(url, key)
            if contentfile:
                contentfile_name = '/loaded/ebook_{}.{}'.format(edition.id, key)
                path = default_storage.save(contentfile_name, contentfile)
                lic = MATCH_LICENSE.search(metadata.rights_url)
                license = 'CC {}'.format(lic.group(1).upper()) if lic else ''
                ebf = models.EbookFile.objects.create(
@@ -923,8 +922,8 @@ class BasePandataLoader(object):
                )
                ebf.ebook = ebook
                ebf.save()

class GithubLoader(BasePandataLoader):
    def load_ebooks(self, metadata, edition, test_mode=False):
        # create Ebook for any ebook in the corresponding GitHub release
@@ -1013,21 +1012,10 @@ def ebooks_in_github_release(repo_owner, repo_name, tag, token=None):
            for asset in release.iter_assets()
            if EBOOK_FORMATS.get(asset.content_type) is not None]

def add_by_webpage(url, work=None, user=None):
    edition = None
    scraper = get_scraper(url)
    loader = BasePandataLoader(url)
    pandata = Pandata()
    pandata.metadata = scraper.metadata
    for metadata in pandata.get_edition_list():
        edition = loader.load_from_pandata(metadata, work)
        work = edition.work
    loader.load_ebooks(pandata, edition, user=user)
    return edition if edition else None

def add_by_sitemap(url, maxnum=None):
def add_from_bookdatas(bookdatas):
    ''' bookdatas is an iterable of scrapers '''
    editions = []
    for bookdata in scrape_sitemap(url, maxnum=maxnum):
    for bookdata in bookdatas:
        edition = work = None
        loader = BasePandataLoader(bookdata.base)
        pandata = Pandata()
@@ -1039,6 +1027,3 @@ def add_by_sitemap(url, maxnum=None):
        if edition:
            editions.append(edition)
    return editions
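
The removed add_by_webpage moves to core/loaders/__init__.py (next file), and the sitemap-specific loop becomes add_from_bookdatas, which accepts any iterable of scraper instances. A minimal sketch of calling the new entry point directly, with hypothetical page URLs:

from regluit.core.bookloader import add_from_bookdatas
from regluit.core.loaders import get_scraper

# hypothetical open-access book pages
urls = [
    'https://press.example.edu/books/open-book-1/',
    'https://press.example.edu/books/open-book-2/',
]
# one scraper per page; add_from_bookdatas creates a work/edition for each
editions = add_from_bookdatas(get_scraper(u) for u in urls)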

View File: core/loaders/__init__.py

@@ -0,0 +1,41 @@
import logging

import requests
from bs4 import BeautifulSoup
from django.conf import settings

from gitenberg.metadata.pandata import Pandata

from regluit.core.bookloader import add_from_bookdatas, BasePandataLoader
from .scrape import PressbooksScraper, HathitrustScraper, BaseScraper
from .springer import SpringerScraper

logger = logging.getLogger(__name__)
def get_scraper(url):
    scrapers = [PressbooksScraper, HathitrustScraper, BaseScraper]
    for scraper in scrapers:
        if scraper.can_scrape(url):
            return scraper(url)

def scrape_sitemap(url, maxnum=None):
    try:
        response = requests.get(url, headers={"User-Agent": settings.USER_AGENT})
        doc = BeautifulSoup(response.content, 'lxml')
        for page in doc.find_all('loc')[0:maxnum]:
            scraper = get_scraper(page.text)
            if scraper.metadata.get('genre', None) == 'book':
                yield scraper
    except requests.exceptions.RequestException as e:
        logger.error(e)
def add_by_webpage(url, work=None, user=None):
    edition = None
    scraper = get_scraper(url)
    loader = BasePandataLoader(url)
    pandata = Pandata()
    pandata.metadata = scraper.metadata
    for metadata in pandata.get_edition_list():
        edition = loader.load_from_pandata(metadata, work)
        work = edition.work
    loader.load_ebooks(pandata, edition, user=user)
    return edition if edition else None

def add_by_sitemap(url, maxnum=None):
    return add_from_bookdatas(scrape_sitemap(url, maxnum=maxnum))
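
With add_by_webpage and add_by_sitemap living here, bookloader no longer needs to import from loaders.scrape (the import removed in the first file). Usage from a Django shell, with a hypothetical sitemap URL:

from regluit.core.loaders import add_by_sitemap, add_by_webpage

# crawl at most 10 <loc> entries; only pages whose metadata genre is 'book' are loaded
editions = add_by_sitemap('https://press.example.edu/sitemap.xml', maxnum=10)

# or load one known book page
edition = add_by_webpage('https://press.example.edu/books/open-book-1/')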

View File: core/loaders/scrape.py

@@ -230,7 +230,7 @@ class BaseScraper(object):
        if value:
            self.set('publication_date', value)

    def get_authors(self):
    def get_author_list(self):
        value_list = self.check_metas([
            'DC.Creator.PersonalName',
            'citation_author',
@@ -239,9 +239,15 @@
        if not value_list:
            value_list = self.get_itemprop('author')
        if not value_list:
            return
            return []
        return value_list

    def get_authors(self):
        value_list = self.get_author_list()
        creator_list = []
        value_list = authlist_cleaner(value_list)
        if len(value_list) == 0:
            return
        if len(value_list) == 1:
            self.set('creator', {'author': {'agent_name': value_list[0]}})
            return
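
Splitting the old get_authors into get_author_list (extraction) and get_authors (cleaning plus creator-dict assembly) lets a subclass override only the extraction step, as SpringerScraper's get_author_list does in the new file below. A hypothetical subclass sketch:

class ExamplePressScraper(BaseScraper):
    # hypothetical subclass; '.book-author' is an assumed site-specific selector
    def get_author_list(self):
        # the inherited get_authors() runs authlist_cleaner over these names
        # and builds the creator dict
        return [el.text.strip() for el in self.doc.select('.book-author')]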
@@ -383,21 +389,3 @@ class HathitrustScraper(BaseScraper):
    def can_scrape(cls, url):
        ''' return True if the class can scrape the URL '''
        return url.find('hathitrust.org') > 0 or url.find('hdl.handle.net/2027/') > 0

def get_scraper(url):
    scrapers = [PressbooksScraper, HathitrustScraper, BaseScraper]
    for scraper in scrapers:
        if scraper.can_scrape(url):
            return scraper(url)

def scrape_sitemap(url, maxnum=None):
    try:
        response = requests.get(url, headers={"User-Agent": settings.USER_AGENT})
        doc = BeautifulSoup(response.content, 'lxml')
        for page in doc.find_all('loc')[0:maxnum]:
            scraper = get_scraper(page.text)
            if scraper.metadata.get('genre', None) == 'book':
                yield scraper
    except requests.exceptions.RequestException as e:
        logger.error(e)

View File: core/loaders/springer.py (new file, 118 lines)

@@ -0,0 +1,118 @@
import re
import requests
from bs4 import BeautifulSoup
from urlparse import urljoin
from django.conf import settings
from regluit.core.validation import identifier_cleaner
from regluit.core.bookloader import add_from_bookdatas
from .scrape import BaseScraper, CONTAINS_CC
MENTIONS_CC = re.compile(r'CC BY(-NC)?(-ND|-SA)?', flags=re.I)
HAS_YEAR = re.compile(r'(19|20)\d\d')
class SpringerScraper(BaseScraper):
    def get_downloads(self):
        for dl_type in ['epub', 'mobi', 'pdf']:
            download_el = self.doc.find('a', title=re.compile(dl_type.upper()))
            if download_el:
                value = download_el.get('href')
                if value:
                    value = urljoin(self.base, value)
                    self.set('download_url_{}'.format(dl_type), value)

    def get_description(self):
        desc = self.doc.select_one('#book-description')
        if desc:
            value = ''
            for div in desc.contents:
                text = div.string.replace(u'\xa0', u' ') if div.string else None
                if text:
                    value = u'{}<p>{}</p>'.format(value, text)
            self.set('description', value)

    def get_keywords(self):
        value = []
        for kw in self.doc.select('.Keyword'):
            value.append(kw.text.strip())
        if value:
            if 'Open Access' in value:
                value.remove('Open Access')
            self.set('subjects', value)

    def get_identifiers(self):
        super(SpringerScraper, self).get_identifiers()
        el = self.doc.select_one('#doi-url')
        if el:
            value = identifier_cleaner('doi', quiet=True)(el.text)
            if value:
                self.identifiers['doi'] = value
    def get_isbns(self):
        isbns = {}
        el = self.doc.select_one('#print-isbn')
        if el:
            value = identifier_cleaner('isbn', quiet=True)(el.text)
            if value:
                isbns['paper'] = value
        el = self.doc.select_one('#electronic-isbn')
        if el:
            value = identifier_cleaner('isbn', quiet=True)(el.text)
            if value:
                isbns['electronic'] = value
        return isbns
    def get_title(self):
        el = self.doc.select_one('#book-title')
        value = el.text.strip() if el else ''
        if value:
            value = value.replace('\n', ': ', 1)
            self.set('title', value)
        else:
            super(SpringerScraper, self).get_title()

    def get_author_list(self):
        for el in self.doc.select('.authors__name'):
            yield el.text.strip().replace(u'\xa0', u' ')
    def get_license(self):
        '''only looks for cc licenses'''
        links = self.doc.find_all(href=CONTAINS_CC)
        for link in links:
            self.set('rights_url', link['href'])
            return
        mention = self.doc.find(string=MENTIONS_CC)
        if mention:
            lic = MENTIONS_CC.search(mention).group(0)
            lic_url = 'https://creativecommons.org/licenses/{}/'.format(lic[3:].lower())
            self.set('rights_url', lic_url)
    def get_pubdate(self):
        pubinfo = self.doc.select_one('#copyright-info')
        if pubinfo:
            yearmatch = HAS_YEAR.search(pubinfo.string)
            if yearmatch:
                self.set('publication_date', yearmatch.group(0))

    @classmethod
    def can_scrape(cls, url):
        ''' return True if the class can scrape the URL '''
        return url.find('10.1007') > 0 or url.find('10.1057') > 0
search_url = 'https://link.springer.com/search/page/{}?facet-content-type=%22Book%22&package=openaccess'

def load_springer(num_pages):
    def springer_open_books(num_pages):
        for page in range(1, num_pages+1):
            url = search_url.format(page)
            response = requests.get(url, headers={"User-Agent": settings.USER_AGENT})
            if response.status_code == 200:
                base = response.url
                doc = BeautifulSoup(response.content, 'lxml')
                for link in doc.select('a.title'):
                    book_url = urljoin(base, link['href'])
                    yield SpringerScraper(book_url)
    return add_from_bookdatas(springer_open_books(num_pages))
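
Because springer_open_books is a generator, search pages are fetched lazily as add_from_bookdatas pulls scrapers from it. Usage from a Django shell (the page count is arbitrary):

from regluit.core.loaders.springer import load_springer

# scrape the first two pages of Springer's open-access book search
editions = load_springer(2)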

View File

@@ -0,0 +1,12 @@
from django.core.management.base import BaseCommand

from regluit.core.loaders.springer import load_springer

class Command(BaseCommand):
    help = "load books from springer open"
    args = "<pages>"

    def handle(self, pages, **options):
        books = load_springer(int(pages))
        print "loaded {} books".format(len(books))
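
Assuming the new command module is saved as core/management/commands/load_springer.py (the diff header does not show its filename), it would be invoked as:

python manage.py load_springer 5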

View File: frontend/views.py

@@ -17,10 +17,10 @@ from regluit.core.bookloader import (
    add_by_googlebooks_id,
    add_by_isbn,
    add_by_oclc,
    add_by_webpage,
)
from regluit.core.parameters import WORK_IDENTIFIERS
from regluit.core.loaders import add_by_webpage
from regluit.core.loaders.utils import ids_from_urls
from regluit.frontend.forms import EditionForm, IdentifierForm