add springer scraper

parent 5f39729d74
commit 82784778c4
core/bookloader.py

@@ -38,7 +38,6 @@ from . import cc
 from . import models
 from .parameters import WORK_IDENTIFIERS
 from .validation import identifier_cleaner, unreverse_name
-from .loaders.scrape import get_scraper, scrape_sitemap
 
 logger = logging.getLogger(__name__)
 request_log = logging.getLogger("requests")
@@ -755,7 +754,7 @@ def edition_for_ident(id_type, id_value):
    #print 'returning edition for {}: {}'.format(id_type, id_value)
    for ident in models.Identifier.objects.filter(type=id_type, value=id_value):
        return ident.edition if ident.edition else ident.work.editions[0]


def edition_for_etype(etype, metadata, default=None):
    '''
    assumes the metadata contains the isbn_etype attributes, and that the editions have been created.
@@ -774,7 +773,7 @@ def edition_for_etype(etype, metadata, default=None):
            return edition_for_ident(key, metadata.identifiers[key])
    for key in metadata.edition_identifiers.keys():
        return edition_for_ident(key, metadata.identifiers[key])


MATCH_LICENSE = re.compile(r'creativecommons.org/licenses/([^/]+)/')

def load_ebookfile(url, etype):
@@ -793,14 +792,14 @@ def load_ebookfile(url, etype):
        logger.error(u'could not open {}'.format(url))
    except ValidationError, e:
        logger.error(u'downloaded {} was not a valid {}'.format(url, etype))

class BasePandataLoader(object):
    def __init__(self, url):
        self.base_url = url

    def load_from_pandata(self, metadata, work=None):
        ''' metadata is a Pandata object'''

        # find a work to associate
        edition = None
        has_ed_id = False
@@ -862,7 +861,7 @@ class BasePandataLoader(object):
            if metadata.description and len(metadata.description) > len(work.description):
                #be careful about overwriting the work description
                work.description = metadata.description
            if metadata.creator and not edition.authors.count():
                edition.authors.clear()
                for key in metadata.creator.keys():
                    creators = metadata.creator[key]
@@ -901,7 +900,7 @@ class BasePandataLoader(object):
                contentfile = load_ebookfile(url, key)
                if contentfile:
                    contentfile_name = '/loaded/ebook_{}.{}'.format(edition.id, key)
                    path = default_storage.save(contentfile_name, contentfile)
                    lic = MATCH_LICENSE.search(metadata.rights_url)
                    license = 'CC {}'.format(lic.group(1).upper()) if lic else ''
                    ebf = models.EbookFile.objects.create(
@@ -923,8 +922,8 @@ class BasePandataLoader(object):
            )
            ebf.ebook = ebook
            ebf.save()


class GithubLoader(BasePandataLoader):
    def load_ebooks(self, metadata, edition, test_mode=False):
        # create Ebook for any ebook in the corresponding GitHub release
@@ -1013,21 +1012,10 @@ def ebooks_in_github_release(repo_owner, repo_name, tag, token=None):
             for asset in release.iter_assets()
             if EBOOK_FORMATS.get(asset.content_type) is not None]
 
-def add_by_webpage(url, work=None, user=None):
-    edition = None
-    scraper = get_scraper(url)
-    loader = BasePandataLoader(url)
-    pandata = Pandata()
-    pandata.metadata = scraper.metadata
-    for metadata in pandata.get_edition_list():
-        edition = loader.load_from_pandata(metadata, work)
-        work = edition.work
-    loader.load_ebooks(pandata, edition, user=user)
-    return edition if edition else None
-
-def add_by_sitemap(url, maxnum=None):
+def add_from_bookdatas(bookdatas):
+    ''' bookdatas are iterators of scrapers '''
     editions = []
-    for bookdata in scrape_sitemap(url, maxnum=maxnum):
+    for bookdata in bookdatas:
         edition = work = None
         loader = BasePandataLoader(bookdata.base)
         pandata = Pandata()
@@ -1039,6 +1027,3 @@ def add_by_sitemap(url, maxnum=None):
         if edition:
             editions.append(edition)
     return editions
-
-
-
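The net effect in core/bookloader.py: add_by_webpage and add_by_sitemap move out to the loaders package, and the scraping loop is generalized into add_from_bookdatas, which accepts any iterable of scrapers. A minimal sketch of how that generalization can be used (the URLs and the choice of BaseScraper here are illustrative, not part of this commit):

    # illustrative only: any iterable of scraper objects will do
    from regluit.core.bookloader import add_from_bookdatas
    from regluit.core.loaders.scrape import BaseScraper

    urls = ['https://example.com/book1', 'https://example.com/book2']
    editions = add_from_bookdatas(BaseScraper(url) for url in urls)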
core/loaders/__init__.py (new file)

@@ -0,0 +1,41 @@
import logging

import requests
from bs4 import BeautifulSoup
from django.conf import settings

from gitenberg.metadata.pandata import Pandata

from regluit.core.bookloader import add_from_bookdatas, BasePandataLoader
from .scrape import PressbooksScraper, HathitrustScraper, BaseScraper
from .springer import SpringerScraper

logger = logging.getLogger(__name__)

def get_scraper(url):
    # SpringerScraper is imported above and belongs in the dispatch list
    # (assumption: it was dropped from this view); BaseScraper stays last as the catch-all
    scrapers = [PressbooksScraper, HathitrustScraper, SpringerScraper, BaseScraper]
    for scraper in scrapers:
        if scraper.can_scrape(url):
            return scraper(url)

def scrape_sitemap(url, maxnum=None):
    try:
        response = requests.get(url, headers={"User-Agent": settings.USER_AGENT})
        doc = BeautifulSoup(response.content, 'lxml')
        for page in doc.find_all('loc')[0:maxnum]:
            scraper = get_scraper(page.text)
            if scraper.metadata.get('genre', None) == 'book':
                yield scraper
    except requests.exceptions.RequestException as e:
        logger.error(e)

def add_by_webpage(url, work=None, user=None):
    edition = None
    scraper = get_scraper(url)
    loader = BasePandataLoader(url)
    pandata = Pandata()
    pandata.metadata = scraper.metadata
    for metadata in pandata.get_edition_list():
        edition = loader.load_from_pandata(metadata, work)
        work = edition.work
    loader.load_ebooks(pandata, edition, user=user)
    return edition if edition else None


def add_by_sitemap(url, maxnum=None):
    return add_from_bookdatas(scrape_sitemap(url, maxnum=maxnum))
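A sketch of the intended dispatch behavior: get_scraper returns an instance of the first scraper class whose can_scrape() accepts the URL, with BaseScraper as the catch-all (the book URL below is made up for illustration):

    # illustrative only; the book URL is invented
    from regluit.core.loaders import get_scraper

    scraper = get_scraper('https://link.springer.com/book/10.1007/978-3-319-00000-0')
    # SpringerScraper.can_scrape matches the 10.1007 DOI prefix,
    # so this should dispatch to SpringerScraper rather than BaseScraper
    title = scraper.metadata.get('title')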
core/loaders/scrape.py

@@ -230,7 +230,7 @@ class BaseScraper(object):
         if value:
             self.set('publication_date', value)
 
-    def get_authors(self):
+    def get_author_list(self):
         value_list = self.check_metas([
             'DC.Creator.PersonalName',
             'citation_author',
@@ -239,9 +239,15 @@ class BaseScraper(object):
         if not value_list:
             value_list = self.get_itemprop('author')
         if not value_list:
-            return
+            return []
+        return value_list
+
+    def get_authors(self):
+        value_list = self.get_author_list()
         creator_list = []
         value_list = authlist_cleaner(value_list)
         if len(value_list) == 0:
             return
         if len(value_list) == 1:
             self.set('creator', {'author': {'agent_name': value_list[0]}})
             return
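This refactor splits author handling into an extraction hook (get_author_list) and the normalization logic that stays in get_authors, so subclasses override only the extraction step. A sketch with a hypothetical subclass:

    # hypothetical subclass: override only the extraction step;
    # BaseScraper.get_authors handles cleaning and the creator structure
    class ExampleScraper(BaseScraper):
        def get_author_list(self):
            return ['Ada Lovelace', 'Charles Babbage']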
@@ -383,21 +389,3 @@ class HathitrustScraper(BaseScraper):
     def can_scrape(cls, url):
         ''' return True if the class can scrape the URL '''
         return url.find('hathitrust.org') > 0 or url.find('hdl.handle.net/2027/') > 0
-
-
-def get_scraper(url):
-    scrapers = [PressbooksScraper, HathitrustScraper, BaseScraper]
-    for scraper in scrapers:
-        if scraper.can_scrape(url):
-            return scraper(url)
-
-def scrape_sitemap(url, maxnum=None):
-    try:
-        response = requests.get(url, headers={"User-Agent": settings.USER_AGENT})
-        doc = BeautifulSoup(response.content, 'lxml')
-        for page in doc.find_all('loc')[0:maxnum]:
-            scraper = get_scraper(page.text)
-            if scraper.metadata.get('genre', None) == 'book':
-                yield scraper
-    except requests.exceptions.RequestException as e:
-        logger.error(e)
core/loaders/springer.py (new file)

@@ -0,0 +1,118 @@
import re
import requests

from bs4 import BeautifulSoup
from urlparse import urljoin
from django.conf import settings

from regluit.core.validation import identifier_cleaner
from regluit.core.bookloader import add_from_bookdatas

from .scrape import BaseScraper, CONTAINS_CC

MENTIONS_CC = re.compile(r'CC BY(-NC)?(-ND|-SA)?', flags=re.I)
HAS_YEAR = re.compile(r'(19|20)\d\d')
class SpringerScraper(BaseScraper):
    def get_downloads(self):
        for dl_type in ['epub', 'mobi', 'pdf']:
            download_el = self.doc.find('a', title=re.compile(dl_type.upper()))
            if download_el:
                value = download_el.get('href')
                if value:
                    value = urljoin(self.base, value)
                    self.set('download_url_{}'.format(dl_type), value)

    def get_description(self):
        desc = self.doc.select_one('#book-description')
        if desc:
            value = ''
            for div in desc.contents:
                text = div.string.replace(u'\xa0', u' ') if div.string else None
                if text:
                    value = u'{}<p>{}</p>'.format(value, text)
            self.set('description', value)

    def get_keywords(self):
        value = []
        for kw in self.doc.select('.Keyword'):
            value.append(kw.text.strip())
        if value:
            if 'Open Access' in value:
                value.remove('Open Access')
            self.set('subjects', value)
    def get_identifiers(self):
        super(SpringerScraper, self).get_identifiers()
        el = self.doc.select_one('#doi-url')
        if el:
            value = identifier_cleaner('doi', quiet=True)(el.text)
            if value:
                self.identifiers['doi'] = value

    def get_isbns(self):
        isbns = {}
        el = self.doc.select_one('#print-isbn')
        if el:
            value = identifier_cleaner('isbn', quiet=True)(el.text)
            if value:
                isbns['paper'] = value
        el = self.doc.select_one('#electronic-isbn')
        if el:
            value = identifier_cleaner('isbn', quiet=True)(el.text)
            if value:
                isbns['electronic'] = value
        return isbns
    def get_title(self):
        value = ''
        el = self.doc.select_one('#book-title')
        if el:
            value = el.text.strip()
            if value:
                value = value.replace('\n', ': ', 1)
                self.set('title', value)
        if not value:
            # fall back to the generic metadata-based title
            super(SpringerScraper, self).get_title()
    def get_author_list(self):
        for el in self.doc.select('.authors__name'):
            yield el.text.strip().replace(u'\xa0', u' ')
    def get_license(self):
        '''only looks for cc licenses'''
        links = self.doc.find_all(href=CONTAINS_CC)
        for link in links:
            self.set('rights_url', link['href'])
            return
        mention = self.doc.find(string=MENTIONS_CC)
        if mention:
            lic = MENTIONS_CC.search(mention).group(0)
            lic_url = 'https://creativecommons.org/licenses/{}/'.format(lic[3:].lower())
            self.set('rights_url', lic_url)
    def get_pubdate(self):
        pubinfo = self.doc.select_one('#copyright-info')
        if pubinfo:
            yearmatch = HAS_YEAR.search(pubinfo.string)
            if yearmatch:
                self.set('publication_date', yearmatch.group(0))
    @classmethod
    def can_scrape(cls, url):
        ''' return True if the class can scrape the URL '''
        # compare against 0: str.find returns -1 (truthy) when the substring is absent
        return url.find('10.1007') >= 0 or url.find('10.1057') >= 0

search_url = 'https://link.springer.com/search/page/{}?facet-content-type=%22Book%22&package=openaccess'

def load_springer(num_pages):
    def springer_open_books(num_pages):
        for page in range(1, num_pages+1):
            url = search_url.format(page)
            response = requests.get(url, headers={"User-Agent": settings.USER_AGENT})
            if response.status_code == 200:
                base = response.url
                doc = BeautifulSoup(response.content, 'lxml')
                for link in doc.select('a.title'):
                    book_url = urljoin(base, link['href'])
                    yield SpringerScraper(book_url)
    return add_from_bookdatas(springer_open_books(num_pages))
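When no creativecommons.org link is present, get_license falls back to a textual license mention; the slice lic[3:] drops the leading 'CC ' to form the URL slug. A quick, standalone check of that mapping (illustrative, not part of the commit):

    # illustrative only: 'CC BY-NC' -> 'by-nc'
    import re
    MENTIONS_CC = re.compile(r'CC BY(-NC)?(-ND|-SA)?', flags=re.I)
    mention = 'This book is licensed under a CC BY-NC license.'
    lic = MENTIONS_CC.search(mention).group(0)
    lic_url = 'https://creativecommons.org/licenses/{}/'.format(lic[3:].lower())
    # lic_url == 'https://creativecommons.org/licenses/by-nc/'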
new Django management command under core/management/commands/ (file name not shown in this view)

@@ -0,0 +1,12 @@
from django.core.management.base import BaseCommand

from regluit.core.loaders.springer import load_springer

class Command(BaseCommand):
    help = "load books from springer open"
    args = "<pages>"

    def handle(self, pages, **options):
        books = load_springer(int(pages))
        print "loaded {} books".format(len(books))
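Django names a management command after its file, which this view doesn't show; assuming the file is load_springer.py, invocation would look like:

    # hypothetical invocation, assuming the command file is named load_springer.py
    python manage.py load_springer 10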
frontend module (file name not shown in this view)

@@ -17,10 +17,10 @@ from regluit.core.bookloader import (
     add_by_googlebooks_id,
     add_by_isbn,
     add_by_oclc,
-    add_by_webpage,
 )
 from regluit.core.parameters import WORK_IDENTIFIERS
 
+from regluit.core.loaders import add_by_webpage
 from regluit.core.loaders.utils import ids_from_urls
 from regluit.frontend.forms import EditionForm, IdentifierForm