add springer scraper

parent 5f39729d74
commit 82784778c4
@@ -38,7 +38,6 @@ from . import cc
 from . import models
 from .parameters import WORK_IDENTIFIERS
 from .validation import identifier_cleaner, unreverse_name
-from .loaders.scrape import get_scraper, scrape_sitemap

 logger = logging.getLogger(__name__)
 request_log = logging.getLogger("requests")
@@ -755,7 +754,7 @@ def edition_for_ident(id_type, id_value):
     #print 'returning edition for {}: {}'.format(id_type, id_value)
     for ident in models.Identifier.objects.filter(type=id_type, value=id_value):
         return ident.edition if ident.edition else ident.work.editions[0]

 def edition_for_etype(etype, metadata, default=None):
     '''
     assumes the metadata contains the isbn_etype attributes, and that the editions have been created.
@@ -774,7 +773,7 @@ def edition_for_etype(etype, metadata, default=None):
         return edition_for_ident(key, metadata.identifiers[key])
     for key in metadata.edition_identifiers.keys():
         return edition_for_ident(key, metadata.identifiers[key])

 MATCH_LICENSE = re.compile(r'creativecommons.org/licenses/([^/]+)/')

 def load_ebookfile(url, etype):
@@ -793,14 +792,14 @@ def load_ebookfile(url, etype):
         logger.error(u'could not open {}'.format(url))
     except ValidationError, e:
         logger.error(u'downloaded {} was not a valid {}'.format(url, etype))

 class BasePandataLoader(object):
     def __init__(self, url):
         self.base_url = url

     def load_from_pandata(self, metadata, work=None):
         ''' metadata is a Pandata object'''

         #find an work to associate
         edition = None
         has_ed_id = False
@@ -862,7 +861,7 @@ class BasePandataLoader(object):
         if metadata.description and len(metadata.description) > len(work.description):
             #be careful about overwriting the work description
             work.description = metadata.description
         if metadata.creator and not edition.authors.count():
             edition.authors.clear()
             for key in metadata.creator.keys():
                 creators = metadata.creator[key]
@@ -901,7 +900,7 @@ class BasePandataLoader(object):
                 contentfile = load_ebookfile(url, key)
                 if contentfile:
                     contentfile_name = '/loaded/ebook_{}.{}'.format(edition.id, key)
                     path = default_storage.save(contentfile_name, contentfile)
                     lic = MATCH_LICENSE.search(metadata.rights_url)
                     license = 'CC {}'.format(lic.group(1).upper()) if lic else ''
                     ebf = models.EbookFile.objects.create(
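For reference, the license string saved with the new EbookFile is taken straight from the rights URL via MATCH_LICENSE; a quick illustration, not part of the commit:

    >>> MATCH_LICENSE.search('https://creativecommons.org/licenses/by-nc/4.0/').group(1)
    'by-nc'
    # 'CC {}'.format('by-nc'.upper()) then yields 'CC BY-NC'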
@@ -923,8 +922,8 @@ class BasePandataLoader(object):
                     )
                     ebf.ebook = ebook
                     ebf.save()


 class GithubLoader(BasePandataLoader):
     def load_ebooks(self, metadata, edition, test_mode=False):
         # create Ebook for any ebook in the corresponding GitHub release
@@ -1013,21 +1012,10 @@ def ebooks_in_github_release(repo_owner, repo_name, tag, token=None):
             for asset in release.iter_assets()
             if EBOOK_FORMATS.get(asset.content_type) is not None]

-def add_by_webpage(url, work=None, user=None):
-    edition = None
-    scraper = get_scraper(url)
-    loader = BasePandataLoader(url)
-    pandata = Pandata()
-    pandata.metadata = scraper.metadata
-    for metadata in pandata.get_edition_list():
-        edition = loader.load_from_pandata(metadata, work)
-        work = edition.work
-        loader.load_ebooks(pandata, edition, user=user)
-    return edition if edition else None
-
-def add_by_sitemap(url, maxnum=None):
+def add_from_bookdatas(bookdatas):
+    ''' bookdatas are iterators of scrapers '''
     editions = []
-    for bookdata in scrape_sitemap(url, maxnum=maxnum):
+    for bookdata in bookdatas:
         edition = work = None
         loader = BasePandataLoader(bookdata.base)
         pandata = Pandata()
@@ -1039,6 +1027,3 @@ def add_by_sitemap(url, maxnum=None):
         if edition:
             editions.append(edition)
     return editions
-
-
-
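The refactor above replaces the URL-specific add_by_webpage/add_by_sitemap pair with add_from_bookdatas, which accepts any iterable of scraper objects exposing .base and .metadata. A minimal sketch of calling it directly; illustrative only, and the example URL is a placeholder:

    from regluit.core.bookloader import add_from_bookdatas
    from regluit.core.loaders.scrape import BaseScraper

    # each scraper fetches and parses its page; add_from_bookdatas turns
    # the scraped metadata into Work/Edition/Ebook records
    editions = add_from_bookdatas([BaseScraper('https://example.com/some-book/')])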
@@ -0,0 +1,41 @@
+import requests
+from bs4 import BeautifulSoup
+
+from gitenberg.metadata.pandata import Pandata
+
+from regluit.core.bookloader import add_from_bookdatas, BasePandataLoader
+from .scrape import PressbooksScraper, HathitrustScraper, BaseScraper
+from .springer import SpringerScraper
+
+def get_scraper(url):
+    scrapers = [PressbooksScraper, HathitrustScraper, BaseScraper]
+    for scraper in scrapers:
+        if scraper.can_scrape(url):
+            return scraper(url)
+
+def scrape_sitemap(url, maxnum=None):
+    try:
+        response = requests.get(url, headers={"User-Agent": settings.USER_AGENT})
+        doc = BeautifulSoup(response.content, 'lxml')
+        for page in doc.find_all('loc')[0:maxnum]:
+            scraper = get_scraper(page.text)
+            if scraper.metadata.get('genre', None) == 'book':
+                yield scraper
+    except requests.exceptions.RequestException as e:
+        logger.error(e)
+
+def add_by_webpage(url, work=None, user=None):
+    edition = None
+    scraper = get_scraper(url)
+    loader = BasePandataLoader(url)
+    pandata = Pandata()
+    pandata.metadata = scraper.metadata
+    for metadata in pandata.get_edition_list():
+        edition = loader.load_from_pandata(metadata, work)
+        work = edition.work
+        loader.load_ebooks(pandata, edition, user=user)
+    return edition if edition else None
+
+
+def add_by_sitemap(url, maxnum=None):
+    return add_from_bookdatas(scrape_sitemap(url, maxnum=maxnum))
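As committed, the new loader module above references settings.USER_AGENT and a module-level logger that its 41 lines never import or define; a minimal sketch of the module-level setup it appears to assume (an assumption, not shown in this diff):

    import logging

    from django.conf import settings  # supplies settings.USER_AGENT for the sitemap request

    logger = logging.getLogger(__name__)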
@@ -230,7 +230,7 @@ class BaseScraper(object):
         if value:
             self.set('publication_date', value)

-    def get_authors(self):
+    def get_author_list(self):
         value_list = self.check_metas([
             'DC.Creator.PersonalName',
             'citation_author',
@@ -239,9 +239,15 @@ class BaseScraper(object):
         if not value_list:
             value_list = self.get_itemprop('author')
         if not value_list:
-            return
+            return []
+        return value_list
+
+    def get_authors(self):
+        value_list = self.get_author_list()
         creator_list = []
         value_list = authlist_cleaner(value_list)
+        if len(value_list) == 0:
+            return
         if len(value_list) == 1:
             self.set('creator', {'author': {'agent_name': value_list[0]}})
             return
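Splitting get_author_list out of get_authors means a subclass only has to return plain name strings and inherits the creator-dict handling. A hypothetical subclass relying on the split; the class name and CSS selector are made up for illustration:

    class ExampleScraper(BaseScraper):
        def get_author_list(self):
            # return plain strings; BaseScraper.get_authors() cleans them
            # and builds the 'creator' metadata entry
            return [el.text.strip() for el in self.doc.select('.author-name')]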
@@ -383,21 +389,3 @@ class HathitrustScraper(BaseScraper):
     def can_scrape(cls, url):
         ''' return True if the class can scrape the URL '''
         return url.find('hathitrust.org') > 0 or url.find('hdl.handle.net/2027/') > 0
-
-
-def get_scraper(url):
-    scrapers = [PressbooksScraper, HathitrustScraper, BaseScraper]
-    for scraper in scrapers:
-        if scraper.can_scrape(url):
-            return scraper(url)
-
-def scrape_sitemap(url, maxnum=None):
-    try:
-        response = requests.get(url, headers={"User-Agent": settings.USER_AGENT})
-        doc = BeautifulSoup(response.content, 'lxml')
-        for page in doc.find_all('loc')[0:maxnum]:
-            scraper = get_scraper(page.text)
-            if scraper.metadata.get('genre', None) == 'book':
-                yield scraper
-    except requests.exceptions.RequestException as e:
-        logger.error(e)
@@ -0,0 +1,118 @@
+import re
+import requests
+
+from bs4 import BeautifulSoup
+from urlparse import urljoin
+from django.conf import settings
+
+from regluit.core.validation import identifier_cleaner
+from regluit.core.bookloader import add_from_bookdatas
+
+from .scrape import BaseScraper, CONTAINS_CC
+
+MENTIONS_CC = re.compile(r'CC BY(-NC)?(-ND|-SA)?', flags=re.I)
+HAS_YEAR = re.compile(r'(19|20)\d\d')
+
+class SpringerScraper(BaseScraper):
+    def get_downloads(self):
+        for dl_type in ['epub', 'mobi', 'pdf']:
+            download_el = self.doc.find('a', title=re.compile(dl_type.upper()))
+            if download_el:
+                value = download_el.get('href')
+                if value:
+                    value = urljoin(self.base, value)
+                    self.set('download_url_{}'.format(dl_type), value)
+
+    def get_description(self):
+        desc = self.doc.select_one('#book-description')
+        if desc:
+            value = ''
+            for div in desc.contents:
+                text = div.string.replace(u'\xa0', u' ') if div.string else None
+                if text:
+                    value = u'{}<p>{}</p>'.format(value, text)
+            self.set('description', value)
+
+    def get_keywords(self):
+        value = []
+        for kw in self.doc.select('.Keyword'):
+            value.append(kw.text.strip())
+        if value:
+            if 'Open Access' in value:
+                value.remove('Open Access')
+            self.set('subjects', value)
+
+    def get_identifiers(self):
+        super(SpringerScraper, self).get_identifiers()
+        el = self.doc.select_one('#doi-url')
+        if el:
+            value = identifier_cleaner('doi', quiet=True)(el.text)
+            if value:
+                self.identifiers['doi'] = value
+
+    def get_isbns(self):
+        isbns = {}
+        el = self.doc.select_one('#print-isbn')
+        if el:
+            value = identifier_cleaner('isbn', quiet=True)(el.text)
+            if value:
+                isbns['paper'] = value
+        el = self.doc.select_one('#electronic-isbn')
+        if el:
+            value = identifier_cleaner('isbn', quiet=True)(el.text)
+            if value:
+                isbns['electronic'] = value
+        return isbns
+
+    def get_title(self):
+        el = self.doc.select_one('#book-title')
+        if el:
+            value = el.text.strip()
+            if value:
+                value = value.replace('\n', ': ', 1)
+                self.set('title', value)
+        if not value:
+            super(SpringerScraper, self).get_title()
+
+    def get_author_list(self):
+        for el in self.doc.select('.authors__name'):
+            yield el.text.strip().replace(u'\xa0', u' ')
+
+    def get_license(self):
+        '''only looks for cc licenses'''
+        links = self.doc.find_all(href=CONTAINS_CC)
+        for link in links:
+            self.set('rights_url', link['href'])
+            return
+        mention = self.doc.find(string=MENTIONS_CC)
+        if mention:
+            lic = MENTIONS_CC.search(mention).group(0)
+            lic_url = 'https://creativecommons.org/licenses/{}/'.format(lic[3:].lower())
+            self.set('rights_url', lic_url)
+
+    def get_pubdate(self):
+        pubinfo = self.doc.select_one('#copyright-info')
+        if pubinfo:
+            yearmatch = HAS_YEAR.search(pubinfo.string)
+            if yearmatch:
+                self.set('publication_date', yearmatch.group(0))
+
+    @classmethod
+    def can_scrape(cls, url):
+        ''' return True if the class can scrape the URL '''
+        return url.find('10.1007') > 0 or url.find('10.1057') > 0
+
+
+search_url = 'https://link.springer.com/search/page/{}?facet-content-type=%22Book%22&package=openaccess'
+def load_springer(num_pages):
+    def springer_open_books(num_pages):
+        for page in range(1, num_pages+1):
+            url = search_url.format(page)
+            response = requests.get(url, headers={"User-Agent": settings.USER_AGENT})
+            if response.status_code == 200:
+                base = response.url
+                doc = BeautifulSoup(response.content, 'lxml')
+                for link in doc.select('a.title'):
+                    book_url = urljoin(base, link['href'])
+                    yield SpringerScraper(book_url)
+    return add_from_bookdatas(springer_open_books(num_pages))
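A minimal usage sketch for the new Springer loader, assuming a configured regluit/Django environment and network access; not part of the commit:

    from regluit.core.loaders.springer import load_springer

    # scrape two pages of Springer's open-access book search and
    # create editions/ebooks for each result
    editions = load_springer(2)
    print "loaded {} springer books".format(len(editions))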
@@ -0,0 +1,12 @@
+from django.core.management.base import BaseCommand
+
+from regluit.core.loaders.springer import load_springer
+
+class Command(BaseCommand):
+    help = "load books from springer open"
+    args = "<pages>"
+
+
+    def handle(self, pages, **options):
+        books = load_springer(int(pages))
+        print "loaded {} books".format(len(books))
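The diff does not show the management command's file name, so the invocation below assumes it is saved as core/management/commands/load_springer.py (a hypothetical name):

    python manage.py load_springer 10   # scrape ten search-result pages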
@@ -17,10 +17,10 @@ from regluit.core.bookloader import (
     add_by_googlebooks_id,
     add_by_isbn,
     add_by_oclc,
-    add_by_webpage,
 )
 from regluit.core.parameters import WORK_IDENTIFIERS

+from regluit.core.loaders import add_by_webpage
 from regluit.core.loaders.utils import ids_from_urls
 from regluit.frontend.forms import EditionForm, IdentifierForm
