regluit/core/pdf.py

98 lines
2.9 KiB
Python
Raw Normal View History

2014-08-28 19:29:41 +00:00
"""
Utilities that manipulate pdf files
"""
import logging
from io import BytesIO
2019-02-28 21:22:23 +00:00
from StringIO import StringIO
2014-08-28 19:29:41 +00:00
from tempfile import NamedTemporaryFile
2019-02-28 21:22:23 +00:00
import requests
from xhtml2pdf import pisa # import python module
from PyPDF2 import PdfFileMerger, PdfFileReader
from PyPDF2.utils import PdfReadError
2014-08-28 19:29:41 +00:00
from django.template.loader import render_to_string
2019-02-28 21:22:23 +00:00
from django.conf import settings
2014-08-28 19:29:41 +00:00
logger = logging.getLogger(__name__)
2014-08-28 19:29:41 +00:00
# Utility function
def ask_pdf(context=None):
    """
    Render the 'pdf/ask.html' template and convert the result to a PDF.

    context -- template context passed to render_to_string; an empty
               dict is used when omitted.

    Returns the StringIO buffer that pisa wrote the PDF into. The buffer
    is not rewound here.

    Raises AssertionError if pisa reports a non-zero error count.
    """
    # A fresh dict per call avoids sharing one mutable default between calls.
    if context is None:
        context = {}
    ask_html = StringIO(unicode(render_to_string('pdf/ask.html', context)))
    # in-memory destination for the generated PDF
    result_file = StringIO()
    # convert HTML to PDF
    pisa_status = pisa.CreatePDF(
        src=ask_html,  # the HTML to convert
        dest=result_file,  # file to receive result
    )
    # err is 0 on success. Raise explicitly (rather than via an assert
    # statement) so the check still runs under `python -O`; the exception
    # type is the one an assert would have produced.
    if pisa_status.err != 0:
        raise AssertionError(
            'pisa reported %s error(s) while rendering pdf/ask.html' % pisa_status.err
        )
    return result_file
2019-02-28 21:22:23 +00:00
def pdf_append(file1, file2, file_out):
    """
    Concatenate two PDFs: file1 followed by file2, written to file_out.

    file1, file2 -- inputs handed to PdfFileMerger.append
    file_out     -- destination handed to PdfFileMerger.write

    Exceptions raised by PyPDF2 while appending or writing propagate to
    the caller.
    """
    merger = PdfFileMerger(strict=False)
    try:
        merger.append(file1)
        merger.append(file2)
        merger.write(file_out)
    finally:
        # Close the merger even when an append or the write fails.
        merger.close()
def test_pdf(pdf_file):
    """
    Return True if PdfFileReader can open pdf_file, False otherwise.

    pdf_file -- one of:
        * a string starting with 'http:' or 'https:': the URL is fetched
          with requests and checked from a temporary file;
        * any other string: treated as a filesystem path and opened in
          binary mode;
        * anything else: treated as a seekable file object, rewound to
          position 0 and read in place.

    Any failure while obtaining the file is logged and reported as False.
    File handles opened by this function are closed before returning; a
    file object supplied by the caller is left open.
    """
    temp = None
    opened_here = False  # True only for handles this function must close
    try:
        if isinstance(pdf_file, (str, unicode)):
            opened_here = True
            if pdf_file.startswith(('http:', 'https:')):
                # Default delete-on-close, so the temp file does not
                # linger on disk after the check.
                temp = NamedTemporaryFile()
                temp.write(requests.get(pdf_file).content)
                temp.seek(0)
            else:
                # hope it's already a file
                temp = open(pdf_file, mode='rb')
        else:
            pdf_file.seek(0)
            temp = pdf_file
        try:
            PdfFileReader(temp)
        except Exception:
            # Whatever the parser raised, the file is not a usable PDF.
            return False
        return True
    except Exception:
        pdf_file = unicode(pdf_file)
        logger.exception('error testing a pdf: %s', pdf_file[:100])
        return False
    finally:
        if opened_here and temp is not None:
            temp.close()
2019-02-28 21:22:23 +00:00
def staple_pdf(urllist, user_agent=settings.USER_AGENT):
    """
    Download every PDF in urllist and merge them, in order, into one PDF.

    urllist    -- iterable of URLs, each expected to return a PDF
    user_agent -- value sent as the User-Agent header on every request

    Returns a BytesIO holding the merged PDF (not rewound here), or None
    if a request hits a connection error, a response status is not 200,
    or PyPDF2 raises PdfReadError while reading or writing. Every failure
    is logged.
    """
    merger = PdfFileMerger(strict=False)
    # Pre-bind so the write-error log below cannot hit an unbound name
    # when urllist is empty.
    url = None
    try:
        # One session for all downloads; closed as soon as they are done.
        with requests.Session() as s:
            for url in urllist:
                try:
                    response = s.get(url, headers={"User-Agent": user_agent})
                except requests.exceptions.ConnectionError:
                    logger.error("Error getting url: %s", url)
                    return None
                if response.status_code != 200:
                    logger.error(
                        "Error getting url: %s (status %s)", url, response.status_code
                    )
                    return None
                try:
                    merger.append(BytesIO(response.content))
                except PdfReadError:
                    logger.error("error reading pdf url: %s", url)
                    return None
        out = BytesIO()
        try:
            merger.write(out)
        except PdfReadError:
            # url is the last URL processed, logged for context only.
            logger.error("error writing pdf url: %s", url)
            return None
        return out
    finally:
        # Close the merger on every exit path; `out` is a separate buffer
        # owned by this function and stays usable.
        merger.close()
def test_test_pdf():
    """
    Check test_pdf against settings.TEST_PDF_URL, first as a URL string
    and then as an open file object holding the downloaded content.
    """
    assert test_pdf(settings.TEST_PDF_URL)
    # The context manager closes and removes the temp file even when the
    # assertion fails.
    with NamedTemporaryFile() as temp:
        temp.write(requests.get(settings.TEST_PDF_URL).content)
        assert test_pdf(temp)