add degruyter handling

- move harvest to separate module
- add ratelimiter class
- add pdf stapler
- add a googlebot UA
- add base url storage in get_soup
parent 14ecd864f0
commit 72a40976bc
core/loaders/harvest.py (new file)
@@ -0,0 +1,147 @@
+import logging
+import re
+import time
+import urlparse
+
+import requests
+
+from django.conf import settings
+from django.core.files.base import ContentFile
+
+from regluit.core.models import (
+    Ebook, EbookFile, path_for_file,
+)
+from regluit.core.pdf import staple_pdf
+
+from .utils import get_soup, type_for_url
+
+
+logger = logging.getLogger(__name__)
+
+DROPBOX_DL = re.compile(r'"(https://dl.dropboxusercontent.com/content_link/[^"]+)"')
+DELAY = 5.0
+
+class RateLimiter(object):
+    def __init__(self):
+        self.last = {}
+
+    def delay(self, provider):
+        if provider in self.last:
+            prev = self.last[provider]
+            pres = time.time()
+            if pres - prev < DELAY:
+                time.sleep(float(DELAY - pres + prev))
+        self.last[provider] = time.time()
+        return
+
+rl = RateLimiter()
+
+def dl_online(ebook, limiter=rl.delay):
+    if ebook.format != 'online':
+        pass
+    elif ebook.url.find(u'dropbox.com/s/') >= 0:
+        for ebf in ebf_if_harvested(ebook.url):
+            return ebf, False
+        limiter(ebook.provider)
+        if ebook.url.find(u'dl=0') >= 0:
+            dl_url = ebook.url.replace(u'dl=0', u'dl=1')
+            return make_dl_ebook(dl_url, ebook)
+        response = requests.get(ebook.url, headers={"User-Agent": settings.USER_AGENT})
+        if response.status_code == 200:
+            match_dl = DROPBOX_DL.search(response.content)
+            if match_dl:
+                return make_dl_ebook(match_dl.group(1), ebook)
+            else:
+                logger.warning('couldn\'t get {}'.format(ebook.url))
+        else:
+            logger.warning('couldn\'t get dl for {}'.format(ebook.url))
+
+    elif ebook.url.find(u'jbe-platform.com/content/books/') >= 0:
+        for ebf in ebf_if_harvested(ebook.url):
+            return ebf, False
+        limiter(ebook.provider)
+        doc = get_soup(ebook.url)
+        if doc:
+            obj = doc.select_one('div.fulltexticoncontainer-PDF a')
+            if obj:
+                dl_url = urlparse.urljoin(ebook.url, obj['href'])
+                return make_dl_ebook(dl_url, ebook)
+            else:
+                logger.warning('couldn\'t get dl_url for {}'.format(ebook.url))
+        else:
+            logger.warning('couldn\'t get soup for {}'.format(ebook.url))
+
+    elif ebook.url.find(u'degruyter') >= 0:
+        for ebf in ebf_if_harvested(ebook.url):
+            return ebf, False
+        limiter(ebook.provider)
+        doc = get_soup(ebook.url, settings.GOOGLEBOT_UA)
+        if doc:
+            try:
+                base = doc.find('base')['href']
+            except:
+                base = ebook.url
+            made = None
+            obj = doc.select_one('a.epub-link')
+            if obj:
+                dl_url = urlparse.urljoin(base, obj['href'])
+                made = make_dl_ebook(dl_url, ebook)
+            pdflinks = [urlparse.urljoin(base, a['href']) for a in doc.select('a.pdf-link')]
+            if pdflinks:
+                made = make_stapled_ebook(pdflinks, ebook)
+            if made:
+                return made
+            else:
+                logger.warning('couldn\'t get dl_url for {}'.format(ebook.url))
+        else:
+            logger.warning('couldn\'t get soup for {}'.format(ebook.url))
+
+    return None, False
+
+def ebf_if_harvested(url):
+    onlines = EbookFile.objects.filter(source=url)
+    if onlines:
+        return onlines
+    return EbookFile.objects.none()
+
+def make_dl_ebook(url, ebook):
+    response = requests.get(url, headers={"User-Agent": settings.USER_AGENT})
+    if response.status_code == 200:
+        filesize = int(response.headers.get("Content-Length", 0))
+        filesize = filesize if filesize else None
+        format = type_for_url(url, content_type=response.headers.get('content-type'))
+        if format != 'online':
+            return make_harvested_ebook(response.content, ebook, format, filesize=filesize)
+        else:
+            logger.warning('download format for {} is not ebook'.format(url))
+    else:
+        logger.warning('couldn\'t get {}'.format(url))
+    return None, False
+
+def make_stapled_ebook(urllist, ebook):
+    pdffile = staple_pdf(urllist, settings.GOOGLEBOT_UA)
+    if not pdffile:
+        return None, False
+    return make_harvested_ebook(pdffile.getvalue(), ebook, 'pdf')
+
+def make_harvested_ebook(content, ebook, format, filesize=0):
+    if not filesize:
+        filesize = len(content)
+    new_ebf = EbookFile.objects.create(
+        edition=ebook.edition,
+        format=format,
+        source=ebook.url,
+    )
+    new_ebf.file.save(path_for_file(new_ebf, None), ContentFile(content))
+    new_ebf.save()
+    new_ebook = Ebook.objects.create(
+        edition=ebook.edition,
+        format=format,
+        provider='Unglue.it',
+        url=new_ebf.file.url,
+        rights=ebook.rights,
+        filesize=filesize,
+        version_label=ebook.version_label,
+        version_iter=ebook.version_iter,
+    )
+    new_ebf.ebook = new_ebook
+    new_ebf.save()
+    return new_ebf, True
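Taken together, the new module exposes one entry point: dl_online returns an (EbookFile, created) pair, short-circuits through ebf_if_harvested when a URL has already been harvested, and calls the injected limiter in every provider branch before touching the network. The limiter itself is just a per-provider timestamp table. A minimal sketch of its observable behavior, assuming the module is importable as regluit.core.loaders.harvest:

    import time

    from regluit.core.loaders.harvest import RateLimiter

    rl = RateLimiter()
    start = time.time()
    rl.delay('degruyter')  # first call for a provider: record a timestamp, no sleep
    rl.delay('degruyter')  # same provider within DELAY: sleep out the remainder of 5.0s
    rl.delay('dropbox')    # a different provider keeps its own clock, so no sleep here
    print(round(time.time() - start))  # ~5

Passing the limiter as an argument (rather than hardwiring the module-level rl) lets callers such as the management command own one RateLimiter per run.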
@@ -1,7 +1,7 @@
 from django.conf import settings
 from django.test import TestCase
 from regluit.core.models import Ebook, Edition, Work
-from .utils import dl_online
+from .harvest import dl_online

 class LoaderTests(TestCase):
     def setUp(self):
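The loader tests only need their import swapped to the new module; the setUp body is truncated in this view. A hypothetical test along the lines the new API invites (class and method names are illustrative, not from the commit):

    from django.test import TestCase

    from regluit.core.loaders.harvest import dl_online
    from regluit.core.models import Ebook

    class DeclineTests(TestCase):
        def test_skips_non_online(self):
            # dl_online declines anything that is not format='online';
            # this path touches neither the network nor the database.
            ebook = Ebook(url=u'https://example.com/book', format='pdf', provider='example')
            ebf, created = dl_online(ebook)
            self.assertIsNone(ebf)
            self.assertFalse(created)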
core/loaders/utils.py
@@ -42,10 +42,15 @@ def utf8_general_ci_norm(s):
     s1 = unicodedata.normalize('NFD', s)
     return ''.join(c for c in s1 if not unicodedata.combining(c)).upper()

-def get_soup(url):
-    response = requests.get(url, headers={"User-Agent": settings.USER_AGENT})
+def get_soup(url, user_agent=settings.USER_AGENT):
+    response = requests.get(url, headers={"User-Agent": user_agent})
     if response.status_code == 200:
-        return BeautifulSoup(response.content, 'lxml')
+        soup = BeautifulSoup(response.content, 'lxml')
+
+        # make sure document has a base
+        if not soup.find('base'):
+            soup.find('head').append(soup.new_tag("base", href=response.url))
+        return soup
     return None

 def get_authors(book):
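get_soup now guarantees a base element: when the fetched page lacks one, the final response URL (after any redirects) is injected, so callers can resolve relative links without tracking redirects themselves. A sketch of the pattern the de Gruyter branch relies on; the URL is a placeholder, the a.pdf-link selector is the one used above:

    import urlparse  # Python 2; urllib.parse in Python 3

    doc = get_soup('https://www.degruyter.com/view/title/12345')  # placeholder URL
    if doc:
        base = doc.find('base')['href']  # always present after get_soup
        for a in doc.select('a.pdf-link'):
            print(urlparse.urljoin(base, a['href']))

One caveat: the append assumes the parsed document has a head element, which the lxml parser does not guarantee for non-HTML payloads.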
core/loaders/utils.py (continued)
@@ -370,74 +375,6 @@ def ids_from_urls(url):
         ids[ident] = id_match.group('id')
     return ids

-DROPBOX_DL = re.compile(r'"(https://dl.dropboxusercontent.com/content_link/[^"]+)"')
-
-def dl_online(ebook):
-    if ebook.format != 'online':
-        pass
-    elif ebook.url.find(u'dropbox.com/s/') >= 0:
-        if ebook.url.find(u'dl=0') >= 0:
-            dl_url = ebook.url.replace(u'dl=0', u'dl=1')
-            return make_dl_ebook(dl_url, ebook)
-        response = requests.get(ebook.url, headers={"User-Agent": settings.USER_AGENT})
-        if response.status_code == 200:
-            match_dl = DROPBOX_DL.search(response.content)
-            if match_dl:
-                return make_dl_ebook(match_dl.group(1), ebook)
-            else:
-                logger.warning('couldn\'t get {}'.format(ebook.url))
-        else:
-            logger.warning('couldn\'t get dl for {}'.format(ebook.url))
-
-    elif ebook.url.find(u'jbe-platform.com/content/books/') >= 0:
-        doc = get_soup(ebook.url)
-        if doc:
-            obj = doc.select_one('div.fulltexticoncontainer-PDF a')
-            if obj:
-                dl_url = urlparse.urljoin(ebook.url, obj['href'])
-                return make_dl_ebook(dl_url, ebook)
-            else:
-                logger.warning('couldn\'t get dl_url for {}'.format(ebook.url))
-        else:
-            logger.warning('couldn\'t get soup for {}'.format(ebook.url))
-
-    return None, False
-
-def make_dl_ebook(url, ebook):
-    if EbookFile.objects.filter(source=ebook.url):
-        return EbookFile.objects.filter(source=ebook.url)[0], False
-    response = requests.get(url, headers={"User-Agent": settings.USER_AGENT})
-    if response.status_code == 200:
-        filesize = int(response.headers.get("Content-Length", 0))
-        filesize = filesize if filesize else None
-        format = type_for_url(url, content_type=response.headers.get('content-type'))
-        if format != 'online':
-            new_ebf = EbookFile.objects.create(
-                edition=ebook.edition,
-                format=format,
-                source=ebook.url,
-            )
-            new_ebf.file.save(path_for_file(new_ebf, None), ContentFile(response.content))
-            new_ebf.save()
-            new_ebook = Ebook.objects.create(
-                edition=ebook.edition,
-                format=format,
-                provider='Unglue.it',
-                url=new_ebf.file.url,
-                rights=ebook.rights,
-                filesize=filesize,
-                version_label=ebook.version_label,
-                version_iter=ebook.version_iter,
-            )
-            new_ebf.ebook = new_ebook
-            new_ebf.save()
-            return new_ebf, True
-        else:
-            logger.warning('download format for {} is not ebook'.format(url))
-    else:
-        logger.warning('couldn\'t get {}'.format(url))
-    return None, False
-
 def type_for_url(url, content_type=None):
     if not url:
         return ''
@@ -1,6 +1,6 @@
 from django.core.management.base import BaseCommand

-from regluit.core.loaders.utils import dl_online
+from regluit.core.loaders.harvest import dl_online, RateLimiter
 from regluit.core.models import Ebook

 class Command(BaseCommand):

@@ -8,14 +8,15 @@ class Command(BaseCommand):
     args = "<limit>"

     def add_arguments(self, parser):
-        parser.add_argument('limit', nargs='?', type=int, default=0, help="max to harvest")
+        parser.add_argument('limit', nargs='?', type=int, default=0, help="max to harvest")

     def handle(self, limit=0, **options):
         limit = int(limit) if limit else 0
+        rl = RateLimiter()
         onlines = Ebook.objects.filter(format='online')
         done = 0
         for online in onlines:
-            new_ebf, new = dl_online(online)
+            new_ebf, new = dl_online(online, limiter=rl.delay)
             if new_ebf and new:
                 done += 1
             if done == limit or done == 50:
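The management command now owns one RateLimiter per run and threads its bound delay method through every dl_online call, so the hard cap of 50 harvests per invocation is spaced politely per provider. An invocation sketch; the command name here is assumed, since the diff view does not show the file name:

    from django.core.management import call_command

    # harvest at most 20 new ebook files in this run (50 is the hard cap)
    call_command('harvest_online_ebooks', '20')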
core/pdf.py
@@ -6,6 +6,7 @@ import requests
 from xhtml2pdf import pisa # import python module
 from PyPDF2 import PdfFileMerger,PdfFileReader
 from StringIO import StringIO
+from io import BytesIO
 from tempfile import NamedTemporaryFile
 from django.template.loader import render_to_string
 from regluit import settings
@@ -60,10 +61,23 @@ def test_pdf(pdf_file):
         logger.exception('error testing a pdf: %s' % pdf_file[:100])
         return False

-def test_test_pdf(self):
-    assert(test_pdf(settings.TEST_PDF_URL))
-    temp = NamedTemporaryFile(delete=False)
-    test_file_content = requests.get(settings.TEST_PDF_URL).content
-    temp.write(test_file_content)
-    assert test_pdf(temp)
-    temp.close()
+def staple_pdf(urllist, user_agent=None):
+    user_agent = user_agent if user_agent else settings.USER_AGENT
+    merger = PdfFileMerger(strict=False)
+    for url in urllist:
+        response = requests.get(url, headers={"User-Agent": user_agent})
+        if response.status_code == 200:
+            merger.append(BytesIO(response.content))
+        else:
+            return None
+    out = BytesIO()
+    merger.write(out)
+    return out
+
+def test_test_pdf():
+    assert(test_pdf(settings.TEST_PDF_URL))
+    temp = NamedTemporaryFile(delete=False)
+    test_file_content = requests.get(settings.TEST_PDF_URL).content
+    temp.write(test_file_content)
+    assert test_pdf(temp)
+    temp.close()
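This hunk also drops the stray self parameter from the module-level test_test_pdf and moves it below the new function. staple_pdf itself fetches each URL in order, appends it to a PdfFileMerger, and bails out with None on the first non-200 response, so callers get either a complete concatenation or nothing. The result is an in-memory BytesIO; a usage sketch with placeholder URLs:

    from regluit.core.pdf import staple_pdf

    out = staple_pdf([
        'https://example.com/front-matter.pdf',
        'https://example.com/chapter-1.pdf',
    ])
    if out is not None:
        with open('/tmp/stapled.pdf', 'wb') as f:
            f.write(out.getvalue())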
@@ -480,6 +480,7 @@ QUESTIONNAIRE_SHOW_ITEM_RESULTS = False
 # Selenium related -- set if Se tests run
 FIREFOX_PATH = ''
 CHROMEDRIVER_PATH = ''
+GOOGLEBOT_UA = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"

 try:
     from .keys.common import *
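GOOGLEBOT_UA is consumed only by the de Gruyter branch above, which passes it to both get_soup and staple_pdf, presumably because that provider serves crawler-friendly pages to a Googlebot user agent that it withholds from ordinary clients.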