#!/usr/bin/env python
# -*- mode: python; indent-tabs-mode: nil; coding: iso-8859-1 -*-

"""
|
|
|
|
BaseFormatter.py
|
|
|
|
|
|
|
|
Copyright 2009-2010 by Marcello Perathoner
|
|
|
|
|
|
|
|
Distributable under the GNU General Public License Version 3 or newer.
|
|
|
|
|
|
|
|
Base class for output formatters.
|
|
|
|
|
|
|
|
"""

from __future__ import unicode_literals

import base64
import datetime
import os
import re

from six.moves import urllib

from genshi.core import _ensure
from genshi.core import escape, Markup, QName
from genshi.core import START, END, TEXT, XML_DECL, DOCTYPE, START_CDATA, END_CDATA, PI, COMMENT
import genshi.output
from genshi.output import EMPTY, EmptyTagFilter, WhitespaceFilter, \
    NamespaceFlattener, DocTypeInserter
import genshi.template

import cherrypy

from libgutenberg import GutenbergGlobals as gg

import BaseSearcher

# use a bit more aggressive whitespace removal than the standard whitespace filter
COLLAPSE_LINES = re.compile('\n[ \t\r\n]+').sub
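
# A quick illustration (sketch): COLLAPSE_LINES('\n', s) rewrites a newline
# followed by any run of whitespace into a single newline, e.g.
#   COLLAPSE_LINES('\n', 'a\n   \n\t b')  ->  'a\nb'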

WHITESPACE_FILTER = genshi.output.WhitespaceFilter()

DATA_URL_CACHE = {}


class BaseFormatter(object):
    """ Base class for formatters. """

    CONTENT_TYPE = 'text/html; charset=UTF-8'

    def __init__(self):
        self.templates = {}

    def format(self, page, os):
        """ Abstract method to override. """
        pass

    def get_serializer(self):
        """ Abstract method to override.

        Like this:
        return genshi.output.XMLSerializer(doctype = self.DOCTYPE, strip_whitespace = False)

        """
        pass

    def send_headers(self):
        """ Send HTTP content-type header. """
        cherrypy.response.headers['Content-Type'] = self.CONTENT_TYPE

    def render(self, page, os):
        """ Render and send to browser. """

        self.send_headers()

        template = self.templates[page]
        ctxt = genshi.template.Context(cherrypy=cherrypy, os=os, bs=BaseSearcher)

        stream = template.stream
        for filter_ in template.filters:
            stream = filter_(iter(stream), ctxt)

        # there's no easy way in genshi to pass collapse_lines to this filter
        stream = WHITESPACE_FILTER(stream, collapse_lines=COLLAPSE_LINES)

        return genshi.output.encode(self.get_serializer()(_ensure(genshi.Stream(stream))),
                                    encoding='utf-8')
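
    # Call sketch (handler and loader names are hypothetical):
    #   formatter.set_template('bibrec', loader.load('bibrec.html'))
    #   body = formatter.render('bibrec', opensearch_results)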

    def set_template(self, page, template):
        """ Set template for page.

        Override this for special handling of template, like adding filters. """
        self.templates[page] = template
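
    # Override sketch (hypothetical subclass; 'my_stream_filter' is assumed),
    # e.g. appending to genshi's Template.filters list:
    #   def set_template(self, page, template):
    #       template.filters.append(my_stream_filter)
    #       BaseFormatter.set_template(self, page, template)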

    @staticmethod
    def format_date(date):
        """ Format a date. """

        if date is None:
            return ''

        try:
            # datetime
            return date.replace(tzinfo=gg.UTC(), microsecond=0).isoformat()
        except TypeError:
            # date
            return datetime.datetime.combine(
                date, datetime.time(tzinfo=gg.UTC())).isoformat()
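
    # Illustrative results (sketch; assumes gg.UTC() is a UTC tzinfo):
    #   format_date(datetime.datetime(2010, 5, 1, 12, 30, 0))
    #     -> '2010-05-01T12:30:00+00:00'
    #   format_date(datetime.date(2010, 5, 1))
    #     -> '2010-05-01T00:00:00+00:00'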

    @staticmethod
    def data_url(path):
        """ Read and convert a file to a data url. """

        if path in DATA_URL_CACHE:
            return DATA_URL_CACHE[path]

        abs_path = os.path.join('https://' + cherrypy.config['file_host'], path.lstrip('/'))
        data_url = abs_path
        try:
            f = urllib.request.urlopen(abs_path)
            retcode = f.getcode()
            if retcode is None or retcode == 200:
                msg = f.info()
                mediatype = msg.get('Content-Type')
                if mediatype:
                    mediatype = mediatype.partition(';')[0]
                    data_url = ('data:' + mediatype + ';base64,' +
                                base64.b64encode(f.read()).decode('ascii'))
            f.close()
        except IOError:
            pass

        DATA_URL_CACHE[path] = data_url
        return data_url
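
    # Usage sketch (path is hypothetical): on success
    #   data_url('/pics/logo.png') -> 'data:image/png;base64,iVBO...'
    # and on IOError the plain absolute https URL is returned instead.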

    def fix_dc(self, dc, os):
        """ Add some info to dc for easier templating. """

        # obsolete private marc codes for cover art
        dc.marcs = [marc for marc in dc.marcs if not marc.code.startswith('9')]

        dc.cover_image = None
        dc.cover_thumb = None
        # cover image really should not be a property of opensearch,
        # but it is accessed in many places and this way we can save a
        # lot of iterations later
        os.cover_image_url = None
        os.cover_thumb_url = None

        for file_ in dc.files:

            # HACK for https://
            if file_.url.startswith('http://'):
                file_.url = 'https' + file_.url[4:]

            file_.dropbox_url = None
            # file_.dropbox_filename = None
            file_.gdrive_url = None
            file_.msdrive_url = None
            file_.honeypot_url = None

            if file_.filetype == 'cover.medium':
                dc.cover_image = file_
                os.snippet_image_url = os.cover_image_url = file_.url
            elif file_.filetype == 'cover.small':
                dc.cover_thumb = file_
                os.cover_thumb_url = file_.url

        dc.xsd_release_date_time = self.format_date(dc.release_date)

        if 'Sound' in dc.categories:
            dc.icon = 'audiobook'
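
    # A concrete formatter is a small subclass; a minimal sketch (class name
    # and choices here are assumed, not the real subclasses):
    #
    #   class ExampleXHTMLFormatter(BaseFormatter):
    #       CONTENT_TYPE = 'application/xhtml+xml; charset=UTF-8'
    #
    #       def get_serializer(self):
    #           return XHTMLSerializer(strip_whitespace=False)
    #
    #       def format(self, page, os):
    #           return self.render(page, os)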


# lifted from genshi/output.py and fixed lang issue
# lang is not allowed in xhtml 1.1 which we must use
# because xhtml+rdfa is based on it

class XHTMLSerializer(genshi.output.XMLSerializer):
    """Produces XHTML text from an event stream.

    >>> from genshi.builder import tag
    >>> elem = tag.div(tag.a(href='foo'), tag.br, tag.hr(noshade=True))
    >>> print(''.join(XHTMLSerializer()(elem.generate())))
    <div><a href="foo"></a><br /><hr noshade="noshade" /></div>
    """

    _EMPTY_ELEMS = frozenset(['area', 'base', 'basefont', 'br', 'col', 'frame',
                              'hr', 'img', 'input', 'isindex', 'link', 'meta',
                              'param'])
    _BOOLEAN_ATTRS = frozenset(['selected', 'checked', 'compact', 'declare',
                                'defer', 'disabled', 'ismap', 'multiple',
                                'nohref', 'noresize', 'noshade', 'nowrap'])
    _PRESERVE_SPACE = frozenset([
        QName('pre'), QName('http://www.w3.org/1999/xhtml}pre'),
        QName('textarea'), QName('http://www.w3.org/1999/xhtml}textarea')
    ])

    def __init__(self, doctype=None, strip_whitespace=True,
                 namespace_prefixes=None, drop_xml_decl=True, cache=True):
        super(XHTMLSerializer, self).__init__(doctype, False)
        self.filters = [EmptyTagFilter()]
        if strip_whitespace:
            self.filters.append(WhitespaceFilter(self._PRESERVE_SPACE))
        namespace_prefixes = namespace_prefixes or {}
        namespace_prefixes['http://www.w3.org/1999/xhtml'] = ''
        self.filters.append(NamespaceFlattener(prefixes=namespace_prefixes,
                                               cache=cache))
        if doctype:
            self.filters.append(DocTypeInserter(doctype))
        self.drop_xml_decl = drop_xml_decl
        self.cache = cache

    def __call__(self, stream):
        boolean_attrs = self._BOOLEAN_ATTRS
        empty_elems = self._EMPTY_ELEMS
        drop_xml_decl = self.drop_xml_decl
        have_decl = have_doctype = False
        in_cdata = False

        cache = {}
        cache_get = cache.get
        if self.cache:
            def _emit(kind, input, output):
                cache[kind, input] = output
                return output
        else:
            def _emit(kind, input, output):
                return output

        for filter_ in self.filters:
            stream = filter_(stream)
        for kind, data, pos in stream:
            cached = cache_get((kind, data))
            if cached is not None:
                yield cached

            elif kind is START or kind is EMPTY:
                tag, attrib = data
                buf = ['<', tag]
                for attr, value in attrib:
                    if attr in boolean_attrs:
                        value = attr
                    # this is the fix
                    # elif attr == 'xml:lang' and 'lang' not in attrib:
                    #     buf += [' lang="', escape(value), '"']
                    elif attr == 'xml:space':
                        continue
                    buf += [' ', attr, '="', escape(value), '"']
                if kind is EMPTY:
                    if tag in empty_elems:
                        buf.append(' />')
                    else:
                        buf.append('></%s>' % tag)
                else:
                    buf.append('>')
                yield _emit(kind, data, Markup(''.join(buf)))

            elif kind is END:
                yield _emit(kind, data, Markup('</%s>' % data))

            elif kind is TEXT:
                if in_cdata:
                    yield _emit(kind, data, data)
                else:
                    yield _emit(kind, data, escape(data, quotes=False))

            elif kind is COMMENT:
                yield _emit(kind, data, Markup('<!--%s-->' % data))

            elif kind is DOCTYPE and not have_doctype:
                name, pubid, sysid = data
                buf = ['<!DOCTYPE %s']
                if pubid:
                    buf.append(' PUBLIC "%s"')
                elif sysid:
                    buf.append(' SYSTEM')
                if sysid:
                    buf.append(' "%s"')
                buf.append('>\n')
                yield Markup(''.join(buf)) % tuple([p for p in data if p])
                have_doctype = True

            elif kind is XML_DECL and not have_decl and not drop_xml_decl:
                version, encoding, standalone = data
                buf = ['<?xml version="%s"' % version]
                if encoding:
                    buf.append(' encoding="%s"' % encoding)
                if standalone != -1:
                    standalone = standalone and 'yes' or 'no'
                    buf.append(' standalone="%s"' % standalone)
                buf.append('?>\n')
                yield Markup(''.join(buf))
                have_decl = True

            elif kind is START_CDATA:
                yield Markup('<![CDATA[')
                in_cdata = True

            elif kind is END_CDATA:
                yield Markup(']]>')
                in_cdata = False

            elif kind is PI:
                yield _emit(kind, data, Markup('<?%s %s?>' % data))