Merge branch 'master' into 2023-final

pull/94/head
eric 2023-12-28 14:12:30 -05:00
commit 1b3a580dfb
8 changed files with 425 additions and 135 deletions

View File

@ -59,42 +59,47 @@ doab_reader = MetadataReader(
STOREPROVIDERS = [
'7switch.com',
"amazon.ca",
"amazon.co.uk",
"amazon.com",
"amazon.de",
"amzn.to",
"apress.com",
'amazon.ca',
'amazon.co.uk',
'amazon.com',
'amazon.de',
'amzn.to',
'apress.com',
'bloomsbury.com',
"bod.de",
"cabi.org",
"cdcshoppingcart.uchicago.edu",
"checkout.sas.ac.uk",
'bod.de',
'cabi.org',
'cdcshoppingcart.uchicago.edu',
'checkout.sas.ac.uk',
'duncker-humblot.de',
"dykinson.com",
'dykinson.com',
'e-elgar.com',
"edicions.ub.edu",
"epubli.de",
"iospress.nl",
"karolinum.cz",
"librumstore.com",
"logos-verlag.de",
"mitpress.mit.edu",
"munishop.muni.cz",
"nomos-shop.de",
"palgrave.com",
'edicions.ub.edu',
'epubli.de',
'eurekaselect.com',
'global.oup.com',
'iospress.nl',
'karolinum.cz',
'librumstore.com',
'logos-verlag.de',
'mitpress.mit.edu',
'munishop.muni.cz',
'nomos-shop.de',
'palgrave.com',
'placedeslibraires.fr',
"play.google.com",
"press.umich.edu",
"pressesuniversitairesdeliege.be",
"publicacions.ub.edu",
"publicacions.urv.cat",
'play.google.com',
'press.umich.edu',
'pressesuniversitairesdeliege.be',
'publicacions.ub.edu',
'publicacions.urv.cat',
'sci.fo',
"una-editions.fr",
"universitetsforlaget.no",
'schueren-verlag.de',
'store.printservice.nl',
'una-editions.fr',
'universitetsforlaget.no',
'usu.edu',
"wbg-wissenverbindet.de",
"zalozba.zrc-sazu.si",
'urldefense.com',
'wbg-wissenverbindet.de',
'zalozba.zrc-sazu.si',
]
def online_to_download(url):

View File

@ -1,6 +1,7 @@
"""
code for harvesting 'online' ebooks
"""
import json
import logging
import re
import time
@ -10,6 +11,7 @@ import requests
from django.conf import settings
from django.core.files.base import ContentFile
from django.core.files.storage import default_storage
from regluit.core import models
from regluit.core.models import loader
@ -23,7 +25,6 @@ logger = logging.getLogger(__name__)
DROPBOX_DL = re.compile(r'"(https://dl.dropboxusercontent.com/content_link/[^"]+)"')
DELAY = 1.0
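# per-provider rate limiter; dl_online and archive_dl take rl.delay as their default limiter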
class RateLimiter(object):
def __init__(self):
@ -40,6 +41,7 @@ class RateLimiter(object):
rl = RateLimiter()
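# the url turned out to be a bookshop page, not a file; keep the ebook but mark its format 'bookshop'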
def set_bookshop(ebook):
ebook.format = 'bookshop'
ebook.save()
@ -62,6 +64,7 @@ def dl_online(ebook, limiter=rl.delay, format='online', force=False):
return harvester(ebook)
return None, 0
def archive_dl(ebook, limiter=rl.delay, force=False):
""" status codes
0 : archive exists
@ -87,6 +90,7 @@ def archive_dl(ebook, limiter=rl.delay, force=False):
status = -1
return status
def clean_archive(ebf):
fsize = ebf.ebook.filesize
ebook = ebf.ebook
@ -110,9 +114,11 @@ def clean_archive(ebf):
CMPPROVIDERS = [
'books.open.tudelft.nl',
'ebooks.epublishing.ekt.gr',
'ebooks.uminho.pt',
'editorial.inudi.edu.pe',
'editorial.ucatolicaluisamigo.edu.co',
'editorial.uniagustiniana.edu.co',
'fcjp.derecho.unap.edu.pe',
'fedoabooks.unina.it',
'humanities-digital-library.org',
'idicap.com',
@ -124,6 +130,7 @@ CMPPROVIDERS = [
'omp.zrc-sazu.si',
'openpress.mtsu.edu',
'teiresias-supplements.mcgill.ca',
'textbooks.open.tudelft.nl',
]
DONT_HARVEST = [
'Unglue.it',
@ -132,17 +139,28 @@ DONT_HARVEST = [
'Google Books',
'OpenEdition Books',
]
MANUAL_HARVEST = [
'cabidigitallibrary.org',
'books.google.be',
'books.google.ch',
'books.google.nl',
]
def harvesters(ebook):
yield ebook.provider == 'OAPEN Library', harvest_oapen
yield ebook.provider in GOOD_PROVIDERS, harvest_generic
yield ebook.provider in MANUAL_HARVEST, harvest_manual
yield 'dropbox.com/s/' in ebook.url, harvest_dropbox
yield ebook.provider == 'jbe-platform.com', harvest_jbe
yield ebook.provider == u'De Gruyter Online', harvest_degruyter
yield OPENBOOKPUB.search(ebook.url), harvest_obp
yield ebook.provider == 'Open Book Publishers', harvest_obp
yield ebook.provider == 'Transcript-Verlag', harvest_transcript
yield ebook.provider == 'shop.budrich.de', harvest_budrich
yield ebook.provider == 'ksp.kit.edu', harvest_ksp
yield ebook.provider in ['repositorio.americana.edu.co'], harvest_dspace2
yield ebook.provider == 'nomos-elibrary.de', harvest_nomos
yield ebook.provider == 'digitalis.uc.pt', harvest_digitalis
yield 'frontiersin.org' in ebook.provider, harvest_frontiersin
yield ebook.provider in ['Palgrave Connect', 'Springer', 'springer.com'], harvest_springerlink
yield ebook.provider == 'pulp.up.ac.za', harvest_pulp
@ -172,15 +190,17 @@ def harvesters(ebook):
yield ebook.provider == 'laboutique.edpsciences.fr', harvest_edpsciences
yield ebook.provider == 'waxmann.com', harvest_waxmann
yield ebook.provider == 'pbsociety.org.pl', harvest_ojs
yield 'sciendo.com' in ebook.provider, harvest_sciendo
yield ebook.provider == 'edition-topoi.org', harvest_topoi
yield ebook.provider == 'meson.press', harvest_meson
yield 'brill' in ebook.provider, harvest_brill
yield ebook.provider == 'DOI Resolver', harvest_doi
yield ebook.provider in ['apps.crossref.org', 'mr.crossref.org'], harvest_doi_coaccess
yield ebook.provider == 'ispf-lab.cnr.it', harvest_ipsflab
yield ebook.provider == 'libros.uchile.cl', harvest_libroschile
yield ebook.provider == 'smithsonian.figshare.com', harvest_figshare
yield ebook.provider == 'fupress.com', harvest_fupress
yield ebook.provider == 'funlam.edu.co', harvest_funlam
yield ebook.provider == 'elibrary.duncker-humblot.com', harvest_dunckerhumblot
yield ebook.provider == 'cornellopen.org', harvest_cornellopen
yield ebook.provider == 'esv.info', harvest_esv
@ -195,11 +215,16 @@ def harvesters(ebook):
yield ebook.provider == 'usmcu.edu', harvest_usmcu
yield ebook.provider == 'lalibreria.upv.es', harvest_upv
yield ebook.provider == 'cambridge.org', harvest_cambridge
yield ebook.provider == 'exonpublications.com', harvest_exon
yield ebook.provider == 'ressources.una-editions.fr', harvest_una
yield ebook.provider == 'wbg-wissenverbindet.de', harvest_wbg
yield ebook.provider == 'urn.kb.se', harvest_kb
yield ebook.provider == 'publikationen.bibliothek.kit.edu', harvest_kit
yield ebook.provider == 'iupress.istanbul.edu.tr', harvest_istanbul
yield ebook.provider == 'editorialbonaventuriana.usb.edu.co', harvest_editorialbonaventuriana
yield ebook.provider == 'verlag.gta.arch.ethz.ch', harvest_gta
yield ebook.provider == 'manchesteruniversitypress.co.uk', harvest_manu
def ebf_if_harvested(url):
onlines = models.EbookFile.objects.filter(source=url)
@ -229,7 +254,6 @@ def make_dl_ebook(url, ebook, user_agent=settings.USER_AGENT, method='GET', ver
logger.info("Previously harvested")
return new_ebf, 0
dl_cf, fmt = loader.load_ebookfile(url, ebook.format,
user_agent=user_agent, method=method, verify=verify)
if dl_cf:
@ -238,6 +262,7 @@ def make_dl_ebook(url, ebook, user_agent=settings.USER_AGENT, method='GET', ver
logger.warning('download format %s for %s is not ebook', ebook.format, url)
return None, 0
def redirect_ebook(ebook):
""" returns an ebook and status :
-3 : bad return code or problem
@ -245,7 +270,7 @@ def redirect_ebook(ebook):
-2 : dead, but we need to keep items
0 : replaced with existing
1 : url updated
"""
try:
r = requests.head(ebook.url, allow_redirects=True)
@ -253,7 +278,7 @@ def redirect_ebook(ebook):
logger.error("Connection refused for %s", url)
logger.error(e)
return ebook, -3
if r.status_code == 404:
if not ebook.ebook_files.exists():
logger.info('deleting ebook for dead url %s', ebook.url)
@ -271,12 +296,14 @@ def redirect_ebook(ebook):
logger.error("status code %s for %s", r.status_code, ebook.url)
return ebook, -3
def make_stapled_ebook(urllist, ebook, user_agent=settings.USER_AGENT, strip_covers=False):
pdffile = staple_pdf(urllist, user_agent, strip_covers=strip_covers)
if not pdffile:
return None, 0
return make_harvested_ebook(ContentFile(pdffile.getvalue()), ebook, 'pdf')
def make_harvested_ebook(content, ebook, format, filesize=0):
if not filesize:
filesize = len(content)
@ -308,11 +335,12 @@ def make_harvested_ebook(content, ebook, format, filesize=0):
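# cap filesize at 2147483647 (2**31 - 1), the largest value a signed 32-bit integer column can hold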
ebook.filesize = filesize if filesize < 2147483647 else 2147483647
ebook.save()
harvested_ebook = ebook
new_ebf.ebook = harvested_ebook
new_ebf.save()
return new_ebf, 1
def is_bookshop_url(url):
if '/prodotto/' in url:
return True
@ -320,12 +348,53 @@ def is_bookshop_url(url):
return True
return False
def harvest_generic(ebook):
if is_bookshop_url(ebook.url):
return set_bookshop(ebook)
return make_dl_ebook(ebook.url, ebook)
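# manual harvest: if a file has been hand-uploaded to storage as mebf/<ebook id>.pdf or .epub,
# wrap it in an EbookFile and a new Ebook record served from Unglue.it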
def harvest_manual(ebook):
def make_manual_ebf(format):
fname = f'mebf/{ebook.id}.{format}'
if default_storage.exists(fname):
filesize = default_storage.size(fname)
new_ebf = models.EbookFile.objects.create(
edition=ebook.edition,
format=format,
source=ebook.url,
)
new_ebf.file.name = fname
harvested_ebook = models.Ebook.objects.create(
edition=ebook.edition,
format=format,
provider='Unglue.it',
url=new_ebf.file.url,
rights=ebook.rights,
filesize=filesize,
version_label=ebook.version_label,
version_iter=ebook.version_iter,
)
new_ebf.ebook = harvested_ebook
new_ebf.save()
return new_ebf
else:
return None
pdf_ebf = make_manual_ebf('pdf')
epub_ebf = make_manual_ebf('epub')
return pdf_ebf or epub_ebf, (1 if pdf_ebf else 0) + (1 if epub_ebf else 0)
def harvest_oapen(ebook):
if is_bookshop_url(ebook.url):
return set_bookshop(ebook)
if '/bitstream/' in ebook.url:
return make_dl_ebook(ebook.url, ebook, user_agent=settings.GOOGLEBOT_UA)
return None, 0
def harvest_one_generic(ebook, selector, user_agent=settings.USER_AGENT):
doc = get_soup(ebook.url, user_agent=user_agent, follow_redirects=True)
if doc:
@ -350,7 +419,7 @@ def harvest_one_generic(ebook, selector, user_agent=settings.USER_AGENT):
def harvest_multiple_generic(ebook, selector, dl=lambda x:x):
num = 0
harvested = None
doc = get_soup(ebook.url, follow_redirects=True)
if doc:
found = []
try:
@ -380,7 +449,7 @@ def harvest_stapled_generic(ebook, selector, chap_selector, strip_covers=0,
except:
base = ebook.url
made = None
# check for complete ebook
if selector:
obj = selector(doc)
@ -405,9 +474,13 @@ def harvest_stapled_generic(ebook, selector, chap_selector, strip_covers=0,
return None, 0
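# matches openbookpublishers.com reader/product/download/book urls; group 2 is either
# a numeric book id or a DOI like 10.11647/OBP.0123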
OPENBOOKPUB = re.compile(r'openbookpublishers.com/+(reader|product|/?download/book|books)/(10\.11647/OBP\.\d+|\d+)')
def harvest_obp(ebook):
match = OPENBOOKPUB.search(ebook.url)
booknum = None
if not match:
return None, 0
if match.group(1) in ('product', 'reader'):
prodnum = match.group(2)
prod_url = 'https://www.openbookpublishers.com/product/{}'.format(prodnum)
@ -420,6 +493,9 @@ def harvest_obp(ebook):
booknum = OPENBOOKPUB.search(booknum).group(2)
else:
logger.warning('couldn\'t get soup for %s', prod_url)
elif match.group(2).startswith('10.'):
dl_url = 'https://books.openbookpublishers.com/' + match.group(2).lower() + '.pdf'
return make_dl_ebook(dl_url, ebook, user_agent=settings.GOOGLEBOT_UA)
else:
booknum = match.group(2)
if not booknum:
@ -429,6 +505,7 @@ def harvest_obp(ebook):
made = make_dl_ebook(dl_url, ebook, user_agent=settings.GOOGLEBOT_UA, method='POST')
return made
DEGRUYTERFULL = re.compile(r'/downloadpdf/title/.*')
DEGRUYTERCHAP = re.compile(r'/downloadpdf/book/.*')
COMPLETE = re.compile(r'complete ebook', flags=re.I)
@ -453,7 +530,7 @@ def harvest_degruyter(ebook):
if obj:
dl_url = urljoin(base, obj['href'])
harvested, made = make_dl_ebook(dl_url, ebook, user_agent=settings.GOOGLEBOT_UA)
# check for pdf
obj = doc.select_one('a.downloadPdf')
if obj:
@ -485,6 +562,7 @@ def harvest_degruyter(ebook):
logger.warning('couldn\'t get soup for %s', ebook.url)
return None, 0
def harvest_dropbox(ebook):
if ebook.url.find(u'dl=0') >= 0:
dl_url = ebook.url.replace(u'dl=0', u'dl=1')
@ -501,14 +579,16 @@ def harvest_dropbox(ebook):
logger.warning('couldn\'t get %s', ebook.url)
else:
logger.warning('couldn\'t get dl for %s', ebook.url)
return None, 0
def harvest_jbe(ebook):
def selector(doc):
return doc.select('div.access-options a[href]')
return harvest_multiple_generic(ebook, selector)
def harvest_transcript(ebook):
num = 0
harvested = None
doc = get_soup(ebook.url)
@ -523,27 +603,33 @@ def harvest_transcript(ebook):
logger.warning('couldn\'t get any dl_url for %s', ebook.url)
return harvested, num
def harvest_ksp(ebook):
def selector(doc):
return doc.select_one('p.linkForPDF a')
return harvest_one_generic(ebook, selector)
def harvest_digitalis(ebook):
def selector(doc):
return doc.select_one('a.item-download-button')
return harvest_one_generic(ebook, selector)
def harvest_kit(ebook):
def selector(doc):
return doc.select_one('a.downloadTextLink')
return harvest_one_generic(ebook, selector)
def harvest_budrich(ebook):
def selector(doc):
return doc.select_one('a.download_pdf')
return harvest_one_generic(ebook, selector)
NOMOSPDF = re.compile('download_full_pdf')
def harvest_nomos(ebook):
doc = get_soup(ebook.url, follow_redirects=True)
try:
base = doc.find('base')['href']
@ -560,7 +646,7 @@ def harvest_nomos(ebook):
# staple the chapters
chaps = doc.select('li.access[data-doi]')
pdflinks = []
for chap in chaps:
link = urljoin(
@ -580,19 +666,16 @@ def harvest_nomos(ebook):
logger.warning('couldn\'t get soup for %s', ebook.url)
return None, 0
def harvest_frontiersin(ebook):
if 'GetFile.aspx' in ebook.url:
ebook.delete()
rl.last.pop(ebook.provider, 0)
return None, 0
if ebook.provider == 'journal.frontiersin.org':
ebook, status = redirect_ebook(ebook)
if status < 1:
return None, -1 if status < 0 else 0
num = 0
harvested = None
doc = get_soup(ebook.url, follow_redirects=True)
if doc:
for obj in doc.select('button[data-href]'):
dl_url = obj['data-href']
@ -606,9 +689,10 @@ def harvest_frontiersin(ebook):
logger.warning('couldn\'t get any dl_url for %s', ebook.url)
return harvested, num
SPRINGERDL = re.compile(r'(EPUB|PDF)')
def harvest_springerlink(ebook):
def selector(doc):
return doc.find_all('a', title=SPRINGERDL)
if ebook.provider == "springer.com":
@ -671,6 +755,7 @@ def harvest_bloomsbury(ebook):
logger.warning('couldn\'t get soup for %s', ebook.url)
return None, 0
def harvest_athabasca(ebook):
def selector(doc):
return doc.select_one('li.downloadPDF a[href]')
@ -685,21 +770,41 @@ def harvest_usu(ebook):
def harvest_fahce(ebook):
def selector(doc):
return doc.select_one('div.pub_format_single a[href]')
return harvest_one_generic(ebook, selector)
def get_meta(doc, term):
obj = doc.find('meta', attrs={"name": term})
if obj:
return obj.get('content', None)
else:
logger.warning(f'no meta for {term}')
BAD_CERTS = {'libri.unimi.it', 'editorial.ucatolicaluisamigo.edu.co', 'openpress.mtsu.edu'}
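# CMPPROVIDERS sites: prefer citation_pdf_url / citation_epub_url metadata; otherwise fall back
# to cmp_download_link anchors, skipping pages whose '.chapters' section links multiple downloads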
def harvest_cmp(ebook):
def selector(doc):
citation_pdf_url = get_meta(doc, "citation_pdf_url")
citation_epub_url = get_meta(doc, "citation_epub_url")
if citation_pdf_url or citation_epub_url:
if citation_pdf_url:
yield {'href': citation_pdf_url}
if citation_epub_url:
yield {'href': citation_epub_url}
else:
objs = doc.select('.chapters a.cmp_download_link[href]')
if len({obj['href'] for obj in objs}) > 1:
return
# this selector is a generator, so yield the links rather than returning them
for obj in doc.select('a.cmp_download_link[href]'):
yield obj
def dl(url):
return url.replace('view', 'download') + '?inline=1'
verify = ebook.provider not in BAD_CERTS
if '/view/' in ebook.url:
return make_dl_ebook(dl(ebook.url), ebook, verify=verify)
return harvest_multiple_generic(ebook, selector, dl=dl)
@ -710,6 +815,21 @@ def harvest_dspace(ebook):
return harvest_one_generic(ebook, selector)
def harvest_dspace2(ebook):
doc = get_soup(ebook.url)
if doc:
citation_pdf_url = get_meta(doc, "citation_pdf_url")
if citation_pdf_url:
dl_url = urljoin(ebook.url, citation_pdf_url)
dl_url = dl_url.replace('http://', 'https://')
return make_dl_ebook(dl_url, ebook)
else:
logger.warning('couldn\'t get dl_url for %s', ebook.url)
else:
logger.warning('couldn\'t get soup for %s', ebook.url)
return None, 0
# won't harvest page-image books
def harvest_unt(ebook):
def selector(doc):
@ -731,7 +851,10 @@ def harvest_mdpi(ebook):
return harvest_one_generic(ebook, selector)
def harvest_idunn(ebook):
if '/doi/book/' in ebook.url:
return harvest_manual(ebook)
doc = get_soup(ebook.url)
if doc:
obj = doc.select_one('#accessinfo[data-product-id]')
@ -761,13 +884,14 @@ def harvest_muse(ebook):
return doc.find_all('a', href=re.compile(r'/chapter/\d+/pdf'))
return harvest_stapled_generic(ebook, None, chap_selector, strip_covers=1)
def harvest_mitpress(ebook):
def chap_selector(doc):
return doc.select('a.section-pdfLink[href]')
return harvest_stapled_generic(ebook, None, chap_selector, strip_covers=0)
def harvest_ios(ebook):
booknum = None
doc = get_soup(ebook.url)
if doc:
@ -787,9 +911,15 @@ def harvest_ios(ebook):
def harvest_elgar(ebook):
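# rewrite the page url to its downloadpdf twin, swapping the trailing extension (presumably 'xml') for 'pdf'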
if 'display' in ebook.url:
url = ebook.url.replace('display', 'downloadpdf')[:-3] + 'pdf'
elif 'monobook-oa' in ebook.url:
url = ebook.url.replace('monobook-oa', 'downloadpdf')[:-3] + 'pdf'
elif 'edcollbook-oa' in ebook.url:
url = ebook.url.replace('edcollbook-oa', 'downloadpdf')[:-3] + 'pdf'
else:
return None, 0
return make_dl_ebook(url, ebook, user_agent=settings.GOOGLEBOT_UA)
def harvest_wsp(ebook):
@ -799,14 +929,17 @@ def harvest_wsp(ebook):
return make_dl_ebook(url, ebook, user_agent=settings.CHROME_UA)
return None, 0
def harvest_mprl(ebook):
def selector(doc):
return doc.select('a.ml-20[href]')
return harvest_multiple_generic(ebook, selector)
def harvest_rti(ebook):
return make_dl_ebook(ebook.url + "/fulltext.pdf", ebook)
def selector(doc):
return doc.find('a', href=re.compile('fulltext.pdf'))
return harvest_one_generic(ebook, selector)
def harvest_unibas(ebook):
@ -814,6 +947,7 @@ def harvest_unibas(ebook):
return doc.select_one('a.ep_document_link[href]')
return harvest_one_generic(ebook, selector)
PENSOFT = re.compile(r'/book/(\d+)/list/')
def harvest_pensoft(ebook):
if ebook.id == 263395:
@ -837,6 +971,8 @@ def harvest_pensoft(ebook):
def harvest_edp(ebook):
def selector(doc):
return doc.select_one('a.fulldl[href]')
if ebook.url.endswith('.pdf'):
return harvest_generic(ebook)
return harvest_one_generic(ebook, selector)
@ -848,7 +984,7 @@ def harvest_edpsciences(ebook):
def harvest_waxmann(ebook):
if ebook.url.startswith('https://www.waxmann.com/buch'):
return make_dl_ebook(ebook.url.replace('buch', 'index.php?eID=download&buchnr='), ebook)
return None, 0
@ -860,19 +996,13 @@ def harvest_ojs(ebook):
return harvest_multiple_generic(ebook, selector, dl=dl)
def harvest_topoi(ebook):
def selector(doc):
return doc.select_one('li.pdf a[href]')
return harvest_one_generic(ebook, selector)
def harvest_meson(ebook):
def selector(doc):
for btn in doc.select('a[href] btn.btn-openaccess'):
yield btn.parent
@ -881,11 +1011,18 @@ def harvest_meson(ebook):
def harvest_brill(ebook):
r = requests.get(ebook.url, headers={'User-Agent': settings.GOOGLEBOT_UA})
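# the slice offsets (29, 32, 38) strip the fixed brill.com prefix, leaving the title id for the downloadpdf url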
if r.url.startswith('https://brill.com/view/title/'):
dl_url = 'https://brill.com/downloadpdf/title/%s.pdf' % r.url[29:]
return make_dl_ebook(dl_url, ebook, user_agent=settings.GOOGLEBOT_UA)
elif r.url.startswith('https://brill.com/display/title/'):
dl_url = 'https://brill.com/downloadpdf/title/%s.pdf' % r.url[32:]
return make_dl_ebook(dl_url, ebook, user_agent=settings.GOOGLEBOT_UA)
elif r.url.startswith('https://brill.com/edcollbook-oa/title/'):
dl_url = 'https://brill.com/downloadpdf/title/%s.pdf' % r.url[38:]
return make_dl_ebook(dl_url, ebook, user_agent=settings.GOOGLEBOT_UA)
return None, 0
def harvest_doi(ebook):
# usually a 404.
ebook, status = redirect_ebook(ebook)
@ -893,6 +1030,7 @@ def harvest_doi(ebook):
return None, -1
return None, 0
def harvest_doi_coaccess(ebook):
# make a new ebook for the "main pub" and ignore the "related pub"
if ebook.url.startswith('https://doi.org/'):
@ -923,7 +1061,8 @@ def harvest_doi_coaccess(ebook):
set_bookshop(ebook)
if format in DOWNLOADABLE:
return make_dl_ebook(url, ebook)
return None, 0
GUID = re.compile(r'FBInit\.GUID = \"([0-9a-z]+)\"')
LIBROSID = re.compile(r'(\d+)$')
@ -947,40 +1086,65 @@ def harvest_libroschile(ebook):
if not guid:
return None, 0
jsonurl = LIBROSJSON % (booknum, guid)
try:
data = requests.get(jsonurl).json()
except:
return None, 0
if not data:
return None, 0
filename = data.get('downloads', {}).get('url', None)
if not filename:
return None, 0
pdfurl = LIBRODPDF % (booknum, filename, guid)
return make_dl_ebook(pdfurl, ebook)
def harvest_ipsflab(ebook):
def selector(doc):
return doc.find_all('a', href=re.compile(r'/system/files/ispf_lab/quaderni/.*\.(pdf|epub)'))
return harvest_multiple_generic(ebook, selector)
def harvest_figshare(ebook):
def selector(doc):
return doc.find('a', href=re.compile(r'/ndownloader/'))
return harvest_one_generic(ebook, selector)
def harvest_fupress(ebook):
def selector(doc):
return doc.select_one('#ctl00_contenuto_pdf a.btn-open[href]')
if 'isbn' in ebook.url:
set_bookshop(ebook)
return None, 0
return harvest_one_generic(ebook, selector)
def harvest_funlam(ebook):
if '/modules/' in ebook.url:
set_bookshop(ebook)
return None, 0
return make_dl_ebook(ebook.url, ebook)
def harvest_dunckerhumblot(ebook):
def selector(doc):
return doc.select_one('div.section__buttons a[href$="download"]')
return harvest_one_generic(ebook, selector)
def harvest_cornellopen(ebook):
def selector(doc):
return doc.select('div.sp-product__buy-btn-container li a[href]')
return harvest_multiple_generic(ebook, selector)
def harvest_editorialbonaventuriana(ebook):
def selector(doc):
return doc.select_one('div.djc_fulltext p a[href$=".pdf"]')
return harvest_one_generic(ebook, selector)
def harvest_esv(ebook):
doc = get_soup(ebook.url.replace('details', 'download'))
if doc:
@ -993,17 +1157,20 @@ def harvest_esv(ebook):
logger.warning('couldn\'t get soup for %s', ebook.url)
return None, 0
def harvest_fulcrum(ebook):
def selector(doc):
return doc.select('ul.monograph-catalog-rep-downloads a[href]')
return harvest_multiple_generic(ebook, selector)
def harvest_ubiquity(ebook):
def selector(doc):
return doc.find_all('a', attrs={'data-category': re.compile('(epub|pdf) download')})
return harvest_multiple_generic(ebook, selector)
def harvest_orkana(ebook):
def selector(doc):
for obj in doc.find_all('p', string=re.compile(r'\((PDF|E-BOK)\)')):
div = obj.find_parent('div')
@ -1011,12 +1178,14 @@ def harvest_orkana(ebook):
yield div.find_next_sibling('div').find('a')
return harvest_multiple_generic(ebook, selector)
def harvest_euna(ebook):
if '/view/' in ebook.url:
return make_dl_ebook(ebook.url.replace('view', 'download'), ebook)
set_bookshop(ebook)
return None, 0
def harvest_orl(ebook):
if ebook.url.startswith('https://openresearchlibrary.org/viewer/'):
orl_id = ebook.url[39:]
@ -1025,16 +1194,19 @@ def harvest_orl(ebook):
ebook)
return None, 0
def harvest_pressesagro(ebook):
def selector(doc):
return doc.select_one('#sidebar ul li span a[href]')
return harvest_one_generic(ebook, selector)
def harvest_buponline(ebook):
def selector(doc):
return doc.find('a', string=DOWNLOAD)
return harvest_one_generic(ebook, selector)
INTECH = re.compile(r'\.intechopen\.com/books/(\d+)$')
def harvest_intech(ebook):
booknum = INTECH.search(ebook.url)
@ -1043,16 +1215,19 @@ def harvest_intech(ebook):
return make_dl_ebook(url, ebook)
return None, 0
def harvest_usmcu(ebook):
def selector(doc):
return doc.find('a', string='PDF download')
return harvest_one_generic(ebook, selector)
def harvest_upv(ebook):
def selector(doc):
return doc.select_one('a.descargar[href]')
return harvest_one_generic(ebook, selector)
def harvest_una_editions(ebook):
doc = get_soup(ebook.url)
if doc:
@ -1065,10 +1240,19 @@ def harvest_una_editions(ebook):
logger.warning('couldn\'t get soup for %s', ebook.url)
return None, 0
def harvest_cambridge(ebook):
ebook, status = redirect_ebook(ebook)
doc = get_soup(ebook.url)
if doc:
obj = doc.find('a', string=re.compile('Full book PDF'))
if obj and obj['href']:
dl_url = urljoin(ebook.url, obj['href'])
return make_dl_ebook(dl_url, ebook)
obj = doc.find('meta', attrs={"name": re.compile("citation_pdf_url")})
if obj and obj['content']:
dl_url = obj['content']
return make_dl_ebook(dl_url, ebook)
pdflinks = []
for obj in doc.select('a[data-pdf-content-id]'):
if obj and obj['href']:
@ -1085,10 +1269,6 @@ def harvest_cambridge(ebook):
logger.warning('couldn\'t get soup for %s', ebook.url)
return None, 0
def harvest_exon(ebook):
doc = get_soup(ebook.url)
@ -1109,11 +1289,13 @@ def harvest_exon(ebook):
logger.warning('couldn\'t get soup for %s', ebook.url)
return None, 0
def harvest_una(ebook):
def selector(doc):
return doc.select_one('#header-primary-action a[href]')
return harvest_one_generic(ebook, selector)
def harvest_wbg(ebook):
''' most of these are archived under files.wbg-wissenverbindet.de '''
doc = get_soup(ebook.url)
@ -1125,7 +1307,95 @@ def harvest_wbg(ebook):
return make_dl_ebook(url, ebook)
return None, 0
def harvest_kb(ebook):
def selector(doc):
return doc.select_one('a[title=fulltext][href]')
return harvest_one_generic(ebook, selector)
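# iupress.istanbul.edu.tr: gather cdn.istanbul download links from the book page and
# from each linked chapter page, then staple the pdfs together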
def harvest_istanbul(ebook):
def cdn_url(soup):
objs = soup.find_all('a', href=re.compile(r'cdn\.istanbul'))
for obj in objs:
yield obj['href']
def pdf_urls(ebook):
doc = get_soup(ebook.url, user_agent=settings.GOOGLEBOT_UA, follow_redirects=True)
if doc:
for content_url in cdn_url(doc):
yield content_url
for obj in doc.select('div.post-content h5 a.from-journal[href]'):
chap_url = urljoin(ebook.url, obj['href'])
chap_doc = get_soup(chap_url, user_agent=settings.GOOGLEBOT_UA, follow_redirects=True)
if chap_doc:
for content_url in cdn_url(chap_doc):
yield content_url
# staple the chapters
# make_stapled_ebook returns an (ebf, num) pair; test the ebf, not the (always truthy) tuple
stapled, num = make_stapled_ebook(pdf_urls(ebook), ebook, user_agent=settings.GOOGLEBOT_UA)
if stapled:
return stapled, num
else:
logger.warning('couldn\'t make ebook file for %s', ebook.url)
return None, 0
def harvest_gta(ebook):
# https://verlag.gta.arch.ethz.ch/en/gta:book_978-3-85676-393-0
pos = ebook.url.find('_')
if pos < 1:
return None, 0
isbn = ebook.url[pos + 1:]
api_host = 'https://api.verlag.gta.arch.ethz.ch'
json_url = f'{api_host}/api/v1/graphs/gta/data/gtaapi:PublicRetrieveBook/gta:book_{isbn}/'
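# the api responds with a json-ld @graph; take the file url from the first node carrying gtaapi:file_url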
r = requests.get(json_url)
if r.status_code == 200:
try:
file_url = None
graph = r.json()['@graph']
for obj in graph:
if "gtaapi:file_url" in obj:
file_url = obj["gtaapi:file_url"]
break
if file_url:
return make_dl_ebook(file_url, ebook)
except (KeyError, IndexError):
logger.error('no item_file for %s', ebook.url)
return None, 0
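# manchesteruniversitypress: follow the 'Open Access' link on the landing page, then
# staple the chapter pdfs listed in the book toc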
def harvest_manu(ebook):
def chap_selector(doc):
return doc.select('div.content-box-body div.book-toc a.c-Button--link[href*="/display/"]')
def dl(url):
return url.replace('/display/', '/downloadpdf/').replace('.xml', '.pdf')
doc = get_soup(ebook.url, follow_redirects=True, user_agent=settings.CHROME_UA)
if doc:
obj = doc.find('a', string=re.compile(r"Open Access"))
if not obj or 'href' not in obj.attrs:
return None, 0
ebook.url = urljoin(ebook.url, obj['href'])
return harvest_stapled_generic(ebook, lambda x: None, chap_selector,
user_agent=settings.CHROME_UA, dl=dl)
return None, 0
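# sciendo pages are rendered by next.js; the pdf/epub links live in the embedded __NEXT_DATA__ json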
def harvest_sciendo(ebook):
def selector(doc):
json_obj = doc.find('script', id='__NEXT_DATA__')
if json_obj:
try:
json_data = json.loads(json_obj.string)
pdf_url = json_data['props']['pageProps']['product']['pdfLink']
epub_url = json_data['props']['pageProps']['product']['epubLink']
if pdf_url or epub_url:
if pdf_url:
yield {'href': pdf_url}
if epub_url:
yield {'href': epub_url}
except json.JSONDecodeError as je:
logger.error(f'Bad json {je.msg}')
except KeyError:
logger.error(f'No links in json for {ebook.url}')
return harvest_multiple_generic(ebook, selector)

View File

@ -2,29 +2,26 @@ from django.core.management.base import BaseCommand
from regluit.core.loaders.doab_utils import online_to_download
from regluit.core.models import Ebook
from regluit.core.models.loader import type_for_url
class Command(BaseCommand):
help = "fix 'online' ebooks"
help = "deactivate dead oapen ebooks"
args = "<limit>"
def add_arguments(self, parser):
parser.add_argument('limit', nargs='?', type=int, default=0, help="max to harvest")
parser.add_argument('limit', nargs='?', type=int, default=0, help="max to fix")
def handle(self, limit=0, **options):
limit = int(limit) if limit else 0
onlines = Ebook.objects.filter(active=1, provider='OAPEN Library',
url__contains='/download/')
done = 0
for online in onlines:
online.active = False
online.save()
done += 1
#self.stdout.write(online.edition.work.title)
if done > limit:
break
self.stdout.write('fixed {} ebooks'.format(done))
if done >= 1000:
self.stdout.write('1000 is the maximum; repeat to do more')

View File

@ -18,3 +18,15 @@ class Command(BaseCommand):
else:
books = load_springer(int(startpage), int(endpage))
self.stdout.write("loaded {} books".format(len(books)))
for edition in books:
done_fmt = set()
for ebook in edition.work.ebooks_all():
for fmt in ['pdf', 'epub', 'mobi']:
if ebook.format == fmt:
if fmt not in done_fmt:
ebook.activate()
done_fmt.add(fmt)
else:
ebook.deactivate()

View File

@ -1078,7 +1078,7 @@ def safe_get_work(work_id):
return work
def path_for_file(instance, filename):
return "ebf/{}.{}".format(uuid.uuid4().hex, instance.format)
return f"ebf/{uuid.uuid4().hex}.{instance.format}"
class EbookFile(models.Model):
file = models.FileField(upload_to=path_for_file)

View File

@ -203,9 +203,9 @@ function put_un_in_cookie2(){
<div class="jsmodule">
<h3 class="module-title">Donate!</h3>
<div class="jsmod-content">
<div>Please help support Unglue.it by making a tax-deductible donation to the Free Ebook Foundation.</div>
<form class="askform" method="POST" action="{% url 'newdonation' %}">
<input type="hidden" value="monographs" name="reason">
<input type="hidden" value="general" name="reason">
<div class="donate_amount">
<label>Amount ($): </label><input id="amount" max="20000.00" min="5.00" name="amount" step="0.01" type="number" value="10.00" class="donate"></div>
<div class="button">

View File

@ -33,6 +33,7 @@ from django.core.mail import EmailMessage
from django.urls import reverse, reverse_lazy
from django.core.validators import validate_email
from django.db.models import Q, Count, Sum
from django.db.utils import IntegrityError
from django.forms import Select
from django.forms.models import inlineformset_factory
from django.http import (
@ -556,7 +557,7 @@ def googlebooks(request, googlebooks_id):
return HttpResponseNotFound("failed looking up googlebooks id %s" % googlebooks_id)
try:
edition = bookloader.add_by_googlebooks_id(googlebooks_id)
if edition and edition.new:
# add related editions asynchronously
tasks.populate_edition.delay(edition.isbn_13)
if request.user.is_authenticated:
@ -564,6 +565,10 @@ def googlebooks(request, googlebooks_id):
except bookloader.LookupFailure:
logger.warning("failed to load googlebooks_id %s" % googlebooks_id)
return HttpResponseNotFound("failed looking up googlebooks id %s" % googlebooks_id)
except IntegrityError:
logger.warning("duplicate (maybe) googlebooks_id %s" % googlebooks_id)
return HttpResponseNotFound("failed adding googlebooks id %s" % googlebooks_id)
if not edition:
return HttpResponseNotFound("invalid googlebooks id")
work_url = reverse('work', kwargs={'work_id': edition.work_id})

View File

@ -26,7 +26,7 @@ _ends_in_num = re.compile(r'\W*\d+$')
def remove_badxml(s):
return _illegal_xml_chars_RE.sub('', s)
_ws_runs_RE = re.compile(r'([\r\n\t]| \$b)+')
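# also collapse stray ' $b' sequences (apparently MARC subfield markers) along with whitespace runs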
def sanitize_ws(s):
return _ws_runs_RE.sub(u' ', s)
@ -39,3 +39,4 @@ def remove_author_junk(authname):
if 'ORCID:' in authname:
authname = authname.split('ORCID:')[0].strip()
return _ends_in_num.sub('', authname)