add degruyter handling

- move harvest to separate module
- add ratelimiter class
- add pdf stapler
- add a googlebot UA
- add base url storage in get_soup
pull/94/head
eric 2019-02-28 15:32:41 -05:00
parent 14ecd864f0
commit 72a40976bc
6 changed files with 182 additions and 82 deletions

147
core/loaders/harvest.py Normal file
View File

@ -0,0 +1,147 @@
import logging
import re
import time
import urlparse

import requests

from django.conf import settings
from django.core.files.base import ContentFile

from regluit.core.models import (
    Ebook, EbookFile, path_for_file,
)
from regluit.core.pdf import staple_pdf

from .utils import get_soup, type_for_url
logger = logging.getLogger(__name__)
DROPBOX_DL = re.compile(r'"(https://dl.dropboxusercontent.com/content_link/[^"]+)"')
DELAY = 5.0
class RateLimiter(object):
    """Keep at least DELAY seconds between successive requests per provider."""

    def __init__(self):
        # provider name -> timestamp (time.time()) of the most recent request
        self.last = {}

    def delay(self, provider):
        """Sleep as needed so requests to `provider` are spaced DELAY seconds apart."""
        prev = self.last.get(provider)
        if prev is not None:
            elapsed = time.time() - prev
            if elapsed < DELAY:
                time.sleep(DELAY - elapsed)
        # record when this (about-to-happen) request was allowed through
        self.last[provider] = time.time()
rl = RateLimiter()
def dl_online(ebook, limiter=rl.delay):
    """Try to harvest a downloadable file for an Ebook whose format is 'online'.

    Dispatches on the ebook's URL to a provider-specific harvester
    (Dropbox share links, jbe-platform.com, degruyter).  `limiter` is
    called with the provider name before any network request so requests
    to one provider are rate-limited.

    Returns a 2-tuple (EbookFile or None, new) where `new` is True only
    when a file was freshly harvested; a previously harvested file for
    the same URL is returned with new=False.
    """
    if ebook.format != 'online':
        return None, False

    if u'dropbox.com/s/' in ebook.url:
        for ebf in ebf_if_harvested(ebook.url):
            # already harvested this URL; reuse the stored file
            return ebf, False
        limiter(ebook.provider)
        if u'dl=0' in ebook.url:
            # Dropbox share links become direct downloads with dl=1
            dl_url = ebook.url.replace(u'dl=0', u'dl=1')
            return make_dl_ebook(dl_url, ebook)
        response = requests.get(ebook.url, headers={"User-Agent": settings.USER_AGENT})
        if response.status_code == 200:
            # scrape the direct-download link out of the share page
            match_dl = DROPBOX_DL.search(response.content)
            if match_dl:
                return make_dl_ebook(match_dl.group(1), ebook)
            logger.warning("couldn't get %s", ebook.url)
        else:
            logger.warning("couldn't get dl for %s", ebook.url)

    elif u'jbe-platform.com/content/books/' in ebook.url:
        for ebf in ebf_if_harvested(ebook.url):
            return ebf, False
        limiter(ebook.provider)
        doc = get_soup(ebook.url)
        if doc:
            obj = doc.select_one('div.fulltexticoncontainer-PDF a')
            if obj:
                dl_url = urlparse.urljoin(ebook.url, obj['href'])
                return make_dl_ebook(dl_url, ebook)
            logger.warning("couldn't get dl_url for %s", ebook.url)
        else:
            logger.warning("couldn't get soup for %s", ebook.url)

    elif u'degruyter' in ebook.url:
        for ebf in ebf_if_harvested(ebook.url):
            return ebf, False
        limiter(ebook.provider)
        # degruyter serves a different page to bots, so use the Googlebot UA
        doc = get_soup(ebook.url, settings.GOOGLEBOT_UA)
        if doc:
            try:
                base = doc.find('base')['href']
            except (TypeError, KeyError):
                # no usable <base> tag (find() returned None or no href);
                # resolve relative links against the page URL instead
                base = ebook.url
            made = None
            obj = doc.select_one('a.epub-link')
            if obj:
                dl_url = urlparse.urljoin(base, obj['href'])
                made = make_dl_ebook(dl_url, ebook)
            # chapter-level PDFs get stapled into a single file (preferred
            # over the epub when both are present)
            pdflinks = [urlparse.urljoin(base, a['href']) for a in doc.select('a.pdf-link')]
            if pdflinks:
                made = make_stapled_ebook(pdflinks, ebook)
            if made:
                return made
            logger.warning("couldn't get dl_url for %s", ebook.url)
        else:
            logger.warning("couldn't get soup for %s", ebook.url)

    return None, False
def ebf_if_harvested(url):
    """Return the EbookFiles already harvested from `url`.

    A queryset is returned in all cases; it is empty when nothing has
    been harvested.  (filter() already yields an empty queryset when
    there are no matches, so the previous truthiness check and
    EbookFile.objects.none() fallback only cost an extra DB query.)
    """
    return EbookFile.objects.filter(source=url)
def make_dl_ebook(url, ebook):
    """Download `url` and store it as a harvested file for `ebook`.

    Returns (EbookFile, True) on success, (None, False) when the
    download fails or the content is not an ebook format.
    """
    response = requests.get(url, headers={"User-Agent": settings.USER_AGENT})
    if response.status_code == 200:
        filesize = int(response.headers.get("Content-Length", 0))
        # 0 means the server didn't report a length; pass None so
        # make_harvested_ebook measures the content instead
        filesize = filesize if filesize else None
        # renamed from `format` to avoid shadowing the builtin
        dl_format = type_for_url(url, content_type=response.headers.get('content-type'))
        if dl_format != 'online':
            return make_harvested_ebook(response.content, ebook, dl_format, filesize=filesize)
        logger.warning('download format for %s is not ebook', url)
    else:
        logger.warning("couldn't get %s", url)
    return None, False
def make_stapled_ebook(urllist, ebook):
    """Staple the PDFs at `urllist` into one document and store it for `ebook`.

    Returns (EbookFile, True) on success, (None, False) if stapling failed.
    """
    stapled = staple_pdf(urllist, settings.GOOGLEBOT_UA)
    if stapled is None:
        return None, False
    return make_harvested_ebook(stapled.getvalue(), ebook, 'pdf')
def make_harvested_ebook(content, ebook, format, filesize=0):
    """Persist harvested bytes as an EbookFile plus a matching Unglue.it Ebook.

    `filesize` of 0 means "measure from content".  Returns (EbookFile, True).
    """
    size = filesize if filesize else len(content)
    ebf = EbookFile.objects.create(
        edition=ebook.edition,
        format=format,
        source=ebook.url,
    )
    # write the payload under the canonical storage path for this file
    ebf.file.save(path_for_file(ebf, None), ContentFile(content))
    ebf.save()
    harvested = Ebook.objects.create(
        edition=ebook.edition,
        format=format,
        provider='Unglue.it',
        url=ebf.file.url,
        rights=ebook.rights,
        filesize=size,
        version_label=ebook.version_label,
        version_iter=ebook.version_iter,
    )
    # link the stored file back to the ebook record it now backs
    ebf.ebook = harvested
    ebf.save()
    return ebf, True

View File

@ -1,7 +1,7 @@
from django.conf import settings
from django.test import TestCase
from regluit.core.models import Ebook, Edition, Work
from .utils import dl_online
from .harvest import dl_online
class LoaderTests(TestCase):
def setUp(self):

View File

@ -42,10 +42,15 @@ def utf8_general_ci_norm(s):
s1 = unicodedata.normalize('NFD', s)
return ''.join(c for c in s1 if not unicodedata.combining(c)).upper()
def get_soup(url, user_agent=settings.USER_AGENT):
    """Fetch `url` and return it parsed with BeautifulSoup/lxml, or None on non-200.

    Guarantees the returned document contains a <base> tag (pointing at
    the final, post-redirect response URL) so callers can resolve
    relative hrefs against it.
    """
    response = requests.get(url, headers={"User-Agent": user_agent})
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'lxml')
        # make sure document has a base
        if not soup.find('base'):
            head = soup.find('head')
            # guard: a headless document would make .append() blow up
            # with AttributeError on None
            if head:
                # response.url reflects any redirects that occurred
                head.append(soup.new_tag("base", href=response.url))
        return soup
    return None
def get_authors(book):
@ -370,74 +375,6 @@ def ids_from_urls(url):
ids[ident] = id_match.group('id')
return ids
DROPBOX_DL = re.compile(r'"(https://dl.dropboxusercontent.com/content_link/[^"]+)"')
def dl_online(ebook):
    """Try to turn an 'online' format ebook link into a downloaded file.

    Handles Dropbox share links and jbe-platform.com book pages.
    Returns (EbookFile or None, new flag).
    """
    if ebook.format != 'online':
        return None, False

    if u'dropbox.com/s/' in ebook.url:
        if u'dl=0' in ebook.url:
            # flip the share link into a direct-download link
            return make_dl_ebook(ebook.url.replace(u'dl=0', u'dl=1'), ebook)
        response = requests.get(ebook.url, headers={"User-Agent": settings.USER_AGENT})
        if response.status_code == 200:
            match_dl = DROPBOX_DL.search(response.content)
            if match_dl:
                return make_dl_ebook(match_dl.group(1), ebook)
            logger.warning('couldn\'t get {}'.format(ebook.url))
        else:
            logger.warning('couldn\'t get dl for {}'.format(ebook.url))
    elif u'jbe-platform.com/content/books/' in ebook.url:
        doc = get_soup(ebook.url)
        if doc:
            pdf_anchor = doc.select_one('div.fulltexticoncontainer-PDF a')
            if pdf_anchor:
                target = urlparse.urljoin(ebook.url, pdf_anchor['href'])
                return make_dl_ebook(target, ebook)
            logger.warning('couldn\'t get dl_url for {}'.format(ebook.url))
        else:
            logger.warning('couldn\'t get soup for {}'.format(ebook.url))

    return None, False
def make_dl_ebook(url, ebook):
    """Download `url` and record it as an EbookFile + Ebook for `ebook`'s edition.

    Reuses an existing harvested file for the same source URL when one
    exists.  Returns (EbookFile, new flag); (None, False) on failure.
    """
    existing = EbookFile.objects.filter(source=ebook.url)
    if existing:
        # already harvested; hand back the stored file
        return existing[0], False
    response = requests.get(url, headers={"User-Agent": settings.USER_AGENT})
    if response.status_code != 200:
        logger.warning('couldn\'t get {}'.format(url))
        return None, False
    # a missing/zero Content-Length is recorded as None
    filesize = int(response.headers.get("Content-Length", 0)) or None
    dl_format = type_for_url(url, content_type=response.headers.get('content-type'))
    if dl_format == 'online':
        logger.warning('download format for {} is not ebook'.format(url))
        return None, False
    new_ebf = EbookFile.objects.create(
        edition=ebook.edition,
        format=dl_format,
        source=ebook.url,
    )
    new_ebf.file.save(path_for_file(new_ebf, None), ContentFile(response.content))
    new_ebf.save()
    new_ebook = Ebook.objects.create(
        edition=ebook.edition,
        format=dl_format,
        provider='Unglue.it',
        url=new_ebf.file.url,
        rights=ebook.rights,
        filesize=filesize,
        version_label=ebook.version_label,
        version_iter=ebook.version_iter,
    )
    new_ebf.ebook = new_ebook
    new_ebf.save()
    return new_ebf, True
def type_for_url(url, content_type=None):
if not url:
return ''

View File

@ -1,6 +1,6 @@
from django.core.management.base import BaseCommand
from regluit.core.loaders.utils import dl_online
from regluit.core.loaders.harvest import dl_online, RateLimiter
from regluit.core.models import Ebook
class Command(BaseCommand):
@ -8,14 +8,15 @@ class Command(BaseCommand):
args = "<limit>"
def add_arguments(self, parser):
parser.add_argument('limit', nargs='?', type=int, default=0, help="max to harvest")
parser.add_argument('limit', nargs='?', type=int, default=0, help="max to harvest")
def handle(self, limit=0, **options):
limit = int(limit) if limit else 0
rl = RateLimiter()
onlines = Ebook.objects.filter(format='online')
done = 0
for online in onlines:
new_ebf, new = dl_online(online)
new_ebf, new = dl_online(online, limiter=rl.delay)
if new_ebf and new:
done += 1
if done == limit or done == 50:

View File

@ -6,6 +6,7 @@ import requests
from xhtml2pdf import pisa # import python module
from PyPDF2 import PdfFileMerger,PdfFileReader
from StringIO import StringIO
from io import BytesIO
from tempfile import NamedTemporaryFile
from django.template.loader import render_to_string
from regluit import settings
@ -60,10 +61,23 @@ def test_pdf(pdf_file):
logger.exception('error testing a pdf: %s' % pdf_file[:100])
return False
def test_test_pdf(self):
    """Smoke-test test_pdf against the configured sample PDF URL and a local copy."""
    assert test_pdf(settings.TEST_PDF_URL)
    tmp = NamedTemporaryFile(delete=False)
    sample = requests.get(settings.TEST_PDF_URL).content
    tmp.write(sample)
    assert test_pdf(tmp)
    tmp.close()
def staple_pdf(urllist, user_agent=None):
    """Download every PDF in `urllist` and merge them into one document.

    `user_agent` defaults to settings.USER_AGENT.  Returns a BytesIO
    containing the merged PDF, or None if any download fails.
    """
    user_agent = user_agent or settings.USER_AGENT
    merger = PdfFileMerger(strict=False)
    try:
        for url in urllist:
            response = requests.get(url, headers={"User-Agent": user_agent})
            if response.status_code != 200:
                # one missing part invalidates the whole staple
                return None
            merger.append(BytesIO(response.content))
        out = BytesIO()
        merger.write(out)
        return out
    finally:
        # release the source streams PdfFileMerger keeps open; the
        # original leaked them on both the success and failure paths
        merger.close()
def test_test_pdf():
    """Exercise test_pdf with both the sample URL and a downloaded temp file."""
    assert test_pdf(settings.TEST_PDF_URL)
    temp = NamedTemporaryFile(delete=False)
    test_file_content = requests.get(settings.TEST_PDF_URL).content
    temp.write(test_file_content)
    # flush so the written bytes are visible when test_pdf reads the file;
    # the original asserted against a possibly-unflushed buffer
    temp.flush()
    assert test_pdf(temp)
    temp.close()

View File

@ -480,6 +480,7 @@ QUESTIONNAIRE_SHOW_ITEM_RESULTS = False
# Selenium related -- set if Se tests run
FIREFOX_PATH = ''
CHROMEDRIVER_PATH = ''
GOOGLEBOT_UA = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
try:
from .keys.common import *