# Pattern for the viewer GUID embedded in the proof page's inline script.
GUID = re.compile(r'FBInit\.GUID = \"([0-9a-z]+)\"')
# The monograph number is taken from the trailing digits of the ebook URL.
LIBROSID = re.compile(r'(\d+)$')
LIBROSROOT = 'https://libros.uchile.cl/files/presses/1/monographs/%s/submission/proof/'
LIBROSINDEX = LIBROSROOT + 'index.html'
LIBROSJSON = LIBROSROOT + 'files/assets/html/workspace.js?uni=%s'
LIBRODPDF = LIBROSROOT + 'files/assets/common/downloads/%s?uni=%s'


def harvest_libroschile(ebook):
    """Harvest the downloadable file for a libros.uchile.cl ebook.

    Steps:
      1. take the monograph number from the trailing digits of ``ebook.url``;
      2. load the proof ``index.html`` page and pull the ``FBInit.GUID`` value
         out of its text;
      3. fetch ``workspace.js`` (parsed as JSON) and read
         ``['downloads']['url']`` to get the download filename;
      4. build the download URL and hand it to ``make_dl_ebook``.

    Returns ``(None, 0)`` whenever any step cannot be completed; otherwise
    returns whatever ``make_dl_ebook(pdfurl, ebook)`` returns.
    """
    # Check the match object before calling .group(); a URL without trailing
    # digits must yield the "nothing harvested" result, not an AttributeError.
    id_match = LIBROSID.search(ebook.url or '')
    if not id_match:
        return None, 0
    booknum = id_match.group(1)

    doc = get_soup(LIBROSINDEX % booknum)
    if not doc:
        return None, 0

    # find(string=<regex>) yields the text node that contains the GUID
    # assignment; the regex is applied again to extract the captured value.
    hit = doc.find(string=GUID)
    if not hit:
        return None, 0
    guid_match = GUID.search(hit)
    if not guid_match:
        return None, 0
    # Use the captured group, not the match object itself: formatting the
    # match object into the URL would embed its repr instead of the GUID.
    guid = guid_match.group(1)

    try:
        workspace = requests.get(LIBROSJSON % (booknum, guid)).json()
    except ValueError:
        # The response body was not valid JSON.
        return None, 0

    try:
        filename = workspace['downloads']['url']
    except (KeyError, TypeError):
        # Empty or unexpectedly shaped payload: no download advertised.
        return None, 0
    if not filename:
        return None, 0

    pdfurl = LIBRODPDF % (booknum, filename, guid)
    return make_dl_ebook(pdfurl, ebook)