#!/usr/bin/env python # -*- mode: python; indent-tabs-mode: nil; -*- coding: iso-8859-1 -*- """ BaseFormatter.py Copyright 2009-2010 by Marcello Perathoner Distributable under the GNU General Public License Version 3 or newer. Base class for output formatters. """ from __future__ import unicode_literals import base64 import datetime import os import re from six.moves import urllib from genshi.core import _ensure from genshi.core import escape, Markup, QName from genshi.core import START, END, TEXT, XML_DECL, DOCTYPE, START_CDATA, END_CDATA, PI, COMMENT import genshi.output from genshi.output import EMPTY, EmptyTagFilter, WhitespaceFilter, \ NamespaceFlattener, DocTypeInserter import genshi.template import cherrypy from libgutenberg import GutenbergGlobals as gg import BaseSearcher # use a bit more aggressive whitespace removal than the standard whitespace filter COLLAPSE_LINES = re.compile('\n[ \t\r\n]+').sub WHITESPACE_FILTER = genshi.output.WhitespaceFilter() DATA_URL_CACHE = {} class BaseFormatter(object): """ Base class for formatters. """ CONTENT_TYPE = 'text/html; charset=UTF-8' def __init__(self): self.templates = {} def format(self, page, os): """ Abstract method to override. """ pass def get_serializer(self): """ Abstract method to override. Like this: return genshi.output.XMLSerializer(doctype = self.DOCTYPE, strip_whitespace = False) """ pass def send_headers(self): """ Send HTTP content-type header. """ cherrypy.response.headers['Content-Type'] = self.CONTENT_TYPE def render(self, page, os): """ Render and send to browser. """ self.send_headers() template = self.templates[page] ctxt = genshi.template.Context(cherrypy=cherrypy, os=os, bs=BaseSearcher) stream = template.stream for filter_ in template.filters: stream = filter_(iter(stream), ctxt) # there's no easy way in genshi to pass collapse_lines to this filter stream = WHITESPACE_FILTER(stream, collapse_lines=COLLAPSE_LINES) return genshi.output.encode(self.get_serializer()(_ensure(genshi.Stream(stream))), encoding='utf-8') def set_template(self, page, template): """ Set template for page. Override this for special handling of template, like adding filters. """ self.templates[page] = template @staticmethod def format_date(date): """ Format a date. """ if date is None: return '' try: # datetime return date.replace(tzinfo=gg.UTC(), microsecond=0).isoformat() except TypeError: # date return datetime.datetime.combine( date, datetime.time(tzinfo=gg.UTC())).isoformat() @staticmethod def data_url(path): """ Read and convert a file to a data url. """ if path in DATA_URL_CACHE: return DATA_URL_CACHE[path] abs_path = os.path.join('https://' + cherrypy.config['file_host'], path.lstrip('/')) data_url = abs_path try: f = urllib.request.urlopen(abs_path) retcode = f.getcode() if retcode is None or retcode == 200: msg = f.info() mediatype = msg.get('Content-Type') if mediatype: mediatype = mediatype.partition(';')[0] data_url = ('data:' + mediatype + ';base64,' + base64.b64encode(f.read()).decode('ascii')) f.close() except IOError: pass DATA_URL_CACHE[path] = data_url return data_url def fix_dc(self, dc, os): """ Add some info to dc for easier templating. """ # obsolete private marc codes for cover art dc.marcs = [ marc for marc in dc.marcs if not marc.code.startswith('9') ] dc.cover_image = None dc.cover_thumb = None # cover image really should not be a property of opensearch, # but it is accessed in many places and this way we can save a # lot of iterations later os.cover_image_url = None os.cover_thumb_url = None for file_ in dc.files: # HACK for https:// if file_.url.startswith('http://'): file_.url = 'https' + file_.url[4:] file_.dropbox_url = None # file_.dropbox_filename = None file_.gdrive_url = None file_.msdrive_url = None file_.honeypot_url = None if file_.filetype == 'cover.medium': dc.cover_image = file_ os.snippet_image_url = os.cover_image_url = file_.url elif file_.filetype == 'cover.small': dc.cover_thumb = file_ os.cover_thumb_url = file_.url dc.xsd_release_date_time = self.format_date(dc.release_date) if 'Sound' in dc.categories: dc.icon = 'audiobook' # lifted from genshi/output.py and fixed lang issue # lang is not allowed in xhtml 1.1 which we must use # because xhtml+rdfa is based on it class XHTMLSerializer(genshi.output.XMLSerializer): """Produces XHTML text from an event stream. >>> from genshi.builder import tag >>> elem = tag.div(tag.a(href='foo'), tag.br, tag.hr(noshade=True)) >>> print(''.join(XHTMLSerializer()(elem.generate())))
""" _EMPTY_ELEMS = frozenset(['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr', 'img', 'input', 'isindex', 'link', 'meta', 'param']) _BOOLEAN_ATTRS = frozenset(['selected', 'checked', 'compact', 'declare', 'defer', 'disabled', 'ismap', 'multiple', 'nohref', 'noresize', 'noshade', 'nowrap']) _PRESERVE_SPACE = frozenset([ QName('pre'), QName('http://www.w3.org/1999/xhtml}pre'), QName('textarea'), QName('http://www.w3.org/1999/xhtml}textarea') ]) def __init__(self, doctype=None, strip_whitespace=True, namespace_prefixes=None, drop_xml_decl=True, cache=True): super(XHTMLSerializer, self).__init__(doctype, False) self.filters = [EmptyTagFilter()] if strip_whitespace: self.filters.append(WhitespaceFilter(self._PRESERVE_SPACE)) namespace_prefixes = namespace_prefixes or {} namespace_prefixes['http://www.w3.org/1999/xhtml'] = '' self.filters.append(NamespaceFlattener(prefixes=namespace_prefixes, cache=cache)) if doctype: self.filters.append(DocTypeInserter(doctype)) self.drop_xml_decl = drop_xml_decl self.cache = cache def __call__(self, stream): boolean_attrs = self._BOOLEAN_ATTRS empty_elems = self._EMPTY_ELEMS drop_xml_decl = self.drop_xml_decl have_decl = have_doctype = False in_cdata = False cache = {} cache_get = cache.get if self.cache: def _emit(kind, input, output): cache[kind, input] = output return output else: def _emit(kind, input, output): return output for filter_ in self.filters: stream = filter_(stream) for kind, data, pos in stream: cached = cache_get((kind, data)) if cached is not None: yield cached elif kind is START or kind is EMPTY: tag, attrib = data buf = ['<', tag] for attr, value in attrib: if attr in boolean_attrs: value = attr # this is the fix # elif attr == 'xml:lang' and 'lang' not in attrib: # buf += [' lang="', escape(value), '"'] elif attr == 'xml:space': continue buf += [' ', attr, '="', escape(value), '"'] if kind is EMPTY: if tag in empty_elems: buf.append(' />') else: buf.append('>%s>' % tag) else: buf.append('>') yield _emit(kind, data, Markup(''.join(buf))) elif kind is END: yield _emit(kind, data, Markup('%s>' % data)) elif kind is TEXT: if in_cdata: yield _emit(kind, data, data) else: yield _emit(kind, data, escape(data, quotes=False)) elif kind is COMMENT: yield _emit(kind, data, Markup('' % data)) elif kind is DOCTYPE and not have_doctype: name, pubid, sysid = data buf = ['\n') yield Markup(''.join(buf)) % tuple([p for p in data if p]) have_doctype = True elif kind is XML_DECL and not have_decl and not drop_xml_decl: version, encoding, standalone = data buf = ['\n') yield Markup(''.join(buf)) have_decl = True elif kind is START_CDATA: yield Markup('') in_cdata = False elif kind is PI: yield _emit(kind, data, Markup('%s %s?>' % data))