regluit/core/pdf.py

"""
Utilities that manipulate pdf files
"""
import logging
from io import BytesIO
from StringIO import StringIO
from tempfile import NamedTemporaryFile

import requests
from xhtml2pdf import pisa             # import python module
from PyPDF2 import PdfFileMerger, PdfFileReader
from PyPDF2.utils import PdfReadError
from django.template.loader import render_to_string
from django.conf import settings

logger = logging.getLogger(__name__)

# Utility function
def ask_pdf(context=None):
    """
    Render the 'pdf/ask.html' template and convert the result to a PDF.

    context -- optional dict of template variables handed to
        render_to_string; an empty context is used when omitted.

    Returns the StringIO buffer that pisa.CreatePDF wrote into. This
    function does not reposition the buffer before returning it.

    Raises AssertionError if pisa reports a non-zero error count.
    """
    # None instead of a shared mutable {} default
    if context is None:
        context = {}
    ask_html = StringIO(unicode(render_to_string('pdf/ask.html', context)))
    # in-memory buffer that receives the generated PDF
    resultFile = StringIO()

    # convert HTML to PDF
    pisaStatus = pisa.CreatePDF(
        src=ask_html,                # the HTML to convert
        dest=resultFile,             # file to receive result
    )
    # pisaStatus.err is 0 on success. Raise explicitly instead of using an
    # `assert` statement, which is removed when Python runs with -O; the
    # exception type is kept so existing callers see the same error.
    if pisaStatus.err != 0:
        raise AssertionError(
            'error converting pdf/ask.html to pdf: %s' % pisaStatus.err
        )
    return resultFile

def pdf_append(file1, file2, file_out):
    """
    Merge two PDFs, file1 followed by file2, and write the result to file_out.

    file1, file2 -- inputs passed to PdfFileMerger.append, in that order
    file_out -- destination passed to PdfFileMerger.write

    Any exception raised while appending or writing propagates to the
    caller; the merger is closed either way.
    """
    merger = PdfFileMerger(strict=False)
    try:
        merger.append(file1)
        merger.append(file2)
        merger.write(file_out)
    finally:
        # always release the merger's inputs, even if an append or the
        # write fails part way through
        merger.close()

def _can_parse_pdf(stream):
    """Return True if PdfFileReader can be constructed on stream, else False."""
    try:
        PdfFileReader(stream)
    except Exception:
        # any parse failure just means "not a usable pdf"; unlike a bare
        # except this lets KeyboardInterrupt/SystemExit through
        return False
    return True


def test_pdf(pdf_file):
    """
    Check whether pdf_file can be opened by PdfFileReader.

    pdf_file -- one of:
        * a string starting with 'http:' or 'https:': the url is downloaded
          and the response body is tested
        * any other string: treated as a path and opened in binary mode
        * anything else: treated as a file-like object; it is rewound with
          seek(0) and left open

    Returns True if PdfFileReader accepts the data, False otherwise. Errors
    while fetching, opening or rewinding the input are logged and also
    reported as False.
    """
    try:
        if isinstance(pdf_file, (str, unicode)):
            if pdf_file.startswith(('http:', 'https:')):
                # the body is already in memory, so wrap it in BytesIO rather
                # than leaving an undeleted temporary file behind on every call
                test_file_content = requests.get(pdf_file).content
                return _can_parse_pdf(BytesIO(test_file_content))
            # hope it's already a file
            with open(pdf_file, mode='rb') as local_file:
                return _can_parse_pdf(local_file)
        # caller-owned file object: rewind it but do not close it
        pdf_file.seek(0)
        return _can_parse_pdf(pdf_file)
    except Exception:
        pdf_file = unicode(pdf_file)
        logger.exception('error testing a pdf: %s', pdf_file[:100])
        return False

def staple_pdf(urllist, user_agent=settings.USER_AGENT):
    """
    Download every url in urllist and merge the PDFs, in order, into one.

    urllist -- iterable of urls, each expected to return a pdf
    user_agent -- value sent as the User-Agent header on every request

    Returns a BytesIO holding the merged pdf, or None if any url cannot be
    fetched, answers with a status other than 200, or yields data that
    PdfFileMerger cannot read or write. Every failure is logged.
    """
    merger = PdfFileMerger(strict=False)
    # bound up front so the final error message cannot hit an unbound name
    # when urllist is empty
    url = None
    # the context manager closes the session's connections on every exit path
    with requests.Session() as session:
        for url in urllist:
            try:
                response = session.get(url, headers={"User-Agent": user_agent})
            except requests.exceptions.RequestException:
                # RequestException is the base class of ConnectionError, so
                # invalid urls, redirect loops etc. also yield None
                logger.error("Error getting url: %s", url)
                return None
            if response.status_code != 200:
                logger.error(
                    "Error getting url: %s (status %s)", url, response.status_code
                )
                return None
            try:
                merger.append(BytesIO(response.content))
            except PdfReadError:
                logger.error("error reading pdf url: %s", url)
                return None
    out = BytesIO()
    try:
        merger.write(out)
    except PdfReadError:
        # url is the last one fetched (None if urllist was empty)
        logger.error("error writing pdf url: %s", url)
        return None
    return out

def test_test_pdf():
    """
    Exercise test_pdf with settings.TEST_PDF_URL, first as a url and then
    as an open file holding the downloaded content.

    Raises AssertionError if either form is rejected.
    """
    assert test_pdf(settings.TEST_PDF_URL)
    test_file_content = requests.get(settings.TEST_PDF_URL).content
    # the context manager closes and removes the temporary file even when
    # the assertion fails, so no stray file is left on disk
    with NamedTemporaryFile() as temp:
        temp.write(test_file_content)
        assert test_pdf(temp)
campaign.add_ask_to_ebfs for pdfs 2014-08-28 19:29:41 +00:00			`"""`
			`Utilities that manipulate pdf files`
			`"""`
fix pdf tester boto3 storage returns unicode not str someplace 2018-06-08 17:56:07 +00:00			`import logging`
add degruyter handling - move harvest to separate module - add ratelimiter class - add pdf stapler - add a googlebot UA - add base url storage in get_soup 2019-02-28 20:32:41 +00:00			`from io import BytesIO`
delint 2019-02-28 21:22:23 +00:00			`from StringIO import StringIO`
campaign.add_ask_to_ebfs for pdfs 2014-08-28 19:29:41 +00:00			`from tempfile import NamedTemporaryFile`
delint 2019-02-28 21:22:23 +00:00
			`import requests`
			`from xhtml2pdf import pisa # import python module`
			`from PyPDF2 import PdfFileMerger, PdfFileReader`
refinements - handle dropbox urls with no params - catch exceptions in stapler - fix dedupe summary 2019-03-03 00:16:47 +00:00			`from PyPDF2.utils import PdfReadError`
campaign.add_ask_to_ebfs for pdfs 2014-08-28 19:29:41 +00:00			`from django.template.loader import render_to_string`
delint 2019-02-28 21:22:23 +00:00			`from django.conf import settings`
campaign.add_ask_to_ebfs for pdfs 2014-08-28 19:29:41 +00:00
fix pdf tester boto3 storage returns unicode not str someplace 2018-06-08 17:56:07 +00:00			`logger = logging.getLogger(__name__)`
campaign.add_ask_to_ebfs for pdfs 2014-08-28 19:29:41 +00:00
			`# Utility function`
			`def ask_pdf(context={}):`
			`ask_html = StringIO(unicode(render_to_string('pdf/ask.html', context)))`
			`# open output file for writing (truncated binary)`
			`resultFile = StringIO()`

			`# convert HTML to PDF`
			`pisaStatus = pisa.CreatePDF(`
delint 2019-02-28 21:22:23 +00:00			`src=ask_html, # the HTML to convert`
			`dest=resultFile, # file to recieve result`
			`)`
campaign.add_ask_to_ebfs for pdfs 2014-08-28 19:29:41 +00:00			`# True on success and False on errors`
			`assert pisaStatus.err == 0`
			`return resultFile`

delint 2019-02-28 21:22:23 +00:00			`def pdf_append(file1, file2, file_out):`
make ebooks when new files are uploaded, clean up old the download bits are suppressed when the ask comes from the pdf seems we weren't making new ebooks when new files were uploaded old ebooks now deactivated so we don't loase download counts and history add ask to pdf is triggered by file upload or THANKS campaign save 2014-09-04 22:33:20 +00:00			`merger = PdfFileMerger(strict=False)`
campaign.add_ask_to_ebfs for pdfs 2014-08-28 19:29:41 +00:00			`merger.append(file1)`
			`merger.append(file2)`
			`merger.write(file_out)`
			`merger.close()`

			`def test_pdf(pdf_file):`
			`temp = None`
			`try:`
delint 2019-02-28 21:22:23 +00:00			`if isinstance(pdf_file, (str, unicode)):`
campaign.add_ask_to_ebfs for pdfs 2014-08-28 19:29:41 +00:00			`if pdf_file.startswith('http:') or pdf_file.startswith('https:'):`
			`temp = NamedTemporaryFile(delete=False)`
			`test_file_content = requests.get(pdf_file).content`
			`temp.write(test_file_content)`
			`temp.seek(0)`
			`else:`
			`# hope it's already a file`
			`temp = open(pdf_file, mode='rb')`
			`else:`
			`pdf_file.seek(0)`
			`temp = pdf_file`
			`try:`
			`PdfFileReader(temp)`
			`success = True`
			`except:`
			`success = False`
			`return success`
fix pdf tester boto3 storage returns unicode not str someplace 2018-06-08 17:56:07 +00:00			`except Exception:`
			`pdf_file = unicode(pdf_file)`
			`logger.exception('error testing a pdf: %s' % pdf_file[:100])`
campaign.add_ask_to_ebfs for pdfs 2014-08-28 19:29:41 +00:00			`return False`

delint 2019-02-28 21:22:23 +00:00			`def staple_pdf(urllist, user_agent=settings.USER_AGENT):`
add degruyter handling - move harvest to separate module - add ratelimiter class - add pdf stapler - add a googlebot UA - add base url storage in get_soup 2019-02-28 20:32:41 +00:00			`merger = PdfFileMerger(strict=False)`
catch more pdf errors 2019-03-05 17:02:42 +00:00			`s = requests.Session()`
add degruyter handling - move harvest to separate module - add ratelimiter class - add pdf stapler - add a googlebot UA - add base url storage in get_soup 2019-02-28 20:32:41 +00:00			`for url in urllist:`
harden stapler 2019-03-04 22:27:55 +00:00			`try:`
catch more pdf errors 2019-03-05 17:02:42 +00:00			`response = s.get(url, headers={"User-Agent": user_agent})`
harden stapler 2019-03-04 22:27:55 +00:00			`except requests.exceptions.ConnectionError:`
			`logger.error("Error getting url: %s", url)`
			`return None`
add degruyter handling - move harvest to separate module - add ratelimiter class - add pdf stapler - add a googlebot UA - add base url storage in get_soup 2019-02-28 20:32:41 +00:00			`if response.status_code == 200:`
refinements - handle dropbox urls with no params - catch exceptions in stapler - fix dedupe summary 2019-03-03 00:16:47 +00:00			`try:`
			`merger.append(BytesIO(response.content))`
			`except PdfReadError:`
			`logger.error("error reading pdf url: %s", url)`
			`return None`
add degruyter handling - move harvest to separate module - add ratelimiter class - add pdf stapler - add a googlebot UA - add base url storage in get_soup 2019-02-28 20:32:41 +00:00			`else:`
			`return None`
			`out = BytesIO()`
catch more pdf errors 2019-03-05 17:02:42 +00:00			`try:`
			`merger.write(out)`
			`except PdfReadError:`
			`logger.error("error writing pdf url: %s", url)`
			`return None`
add degruyter handling - move harvest to separate module - add ratelimiter class - add pdf stapler - add a googlebot UA - add base url storage in get_soup 2019-02-28 20:32:41 +00:00			`return out`

			`def test_test_pdf():`
delint 2019-02-28 21:22:23 +00:00			`assert(test_pdf(settings.TEST_PDF_URL))`
			`temp = NamedTemporaryFile(delete=False)`
			`test_file_content = requests.get(settings.TEST_PDF_URL).content`
			`temp.write(test_file_content)`
			`assert test_pdf(temp)`
			`temp.close()`