benjamins, kiel, UA

pull/94/head
eric 2024-05-06 12:45:47 -04:00
parent 46dd2fed25
commit 454acbe3ef
3 changed files with 30 additions and 1 deletions

View File

@ -82,6 +82,7 @@ STOREPROVIDERS = [
'karolinum.cz',
'librumstore.com',
'logos-verlag.de',
'universitaetsverlag.uni-kiel.de',
'manchesteruniversitypress.co.uk',
'mitpress.mit.edu',
'munishop.muni.cz',

View File

@ -247,6 +247,8 @@ def harvesters(ebook):
yield ebook.provider == 'verlag.gta.arch.ethz.ch', harvest_gta
yield ebook.provider == 'manchesteruniversitypress.co.uk', harvest_manu
yield ebook.provider == 'tectum-elibrary.de', harvest_tecnum
yield ebook.provider == 'benjamins.com', harvest_benjamins
yield ebook.provider == 'macau.uni-kiel.de', harvest_citation_meta_generic
def ebf_if_harvested(url):
@ -1454,6 +1456,7 @@ def harvest_sciendo(ebook):
logger.error('No links in json for {ebook.url}')
return harvest_multiple_generic(ebook, selector)
# 2step
def harvest_liege(ebook):
def selector(doc):
urls = []
@ -1486,4 +1489,28 @@ def harvest_liege(ebook):
return harvest_multiple_generic(ebook, selector)
# 2step
def harvest_benjamins(ebook):
    """Harvest download links for an ebook via its jbe-platform.com page.

    The selector looks in the provider page for the first anchor whose href
    contains ``jbe-platform.com``, fetches that linked page with
    ``get_soup``, and yields one ``{'href': url}`` dict for every link found
    under ``.access-options``. Nothing is yielded when the anchor is missing
    or the linked page cannot be loaded.

    Returns whatever ``harvest_multiple_generic(ebook, selector)`` returns.
    """
    def selector(doc):
        # The dot is escaped so only a literal "jbe-platform.com" matches.
        page = doc.find('a', href=re.compile(r'jbe-platform\.com'))
        if not page:
            return
        base = page['href']
        base_doc = get_soup(base, follow_redirects=True)
        if not base_doc:
            return
        for link in base_doc.select('.access-options a[href]'):
            # hrefs may be relative; resolve them against the platform URL.
            yield {'href': urljoin(base, link['href'])}

    return harvest_multiple_generic(ebook, selector)
def harvest_citation_meta_generic(ebook):
    """Harvest the files named by a page's citation meta values.

    The selector reads ``citation_pdf_url`` and ``citation_epub_url`` from
    the document with ``get_meta`` and yields a ``{'href': url}`` dict for
    each value that is set, PDF first and EPUB second.

    Returns whatever ``harvest_multiple_generic(ebook, selector)`` returns.
    """
    def selector(doc):
        # Look up both values before yielding anything.
        candidates = [
            get_meta(doc, meta_name)
            for meta_name in ("citation_pdf_url", "citation_epub_url")
        ]
        for target in candidates:
            if target:
                yield {'href': target}

    return harvest_multiple_generic(ebook, selector)

View File

@ -503,7 +503,8 @@ FILE_UPLOAD_MAX_MEMORY_SIZE = 20971520 #20MB
# Browser/driver locations; both are empty strings here.
# NOTE(review): presumably filesystem paths overridden per deployment — confirm.
FIREFOX_PATH = ''
CHROMEDRIVER_PATH = ''
# User-Agent header strings.
GOOGLEBOT_UA = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
CHROME_UA = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
# NOTE(review): CHROME_UA is assigned twice in this view; if both lines are
# present, this later assignment is the one that takes effect. The value
# below has no "Chrome/" token (it carries "Version/15.5 Safari/605.1.15"),
# so despite the constant's name it is a Safari-style User-Agent.
CHROME_UA = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.5 Safari/605.1.15'
try:
from .keys.common import *