autocat3/BaseFormatter.py

#!/usr/bin/env python
#  -*- mode: python; indent-tabs-mode: nil; -*- coding: iso-8859-1 -*-

"""
BaseFormatter.py

Copyright 2009-2010 by Marcello Perathoner

Distributable under the GNU General Public License Version 3 or newer.

Base class for output formatters.

"""

from __future__ import unicode_literals

import base64
import datetime
import os
import re
from six.moves import urllib

from genshi.core import _ensure
from genshi.core import escape, Markup, QName
from genshi.core import START, END, TEXT, XML_DECL, DOCTYPE, START_CDATA, END_CDATA, PI, COMMENT
import genshi.output
from genshi.output import EMPTY, EmptyTagFilter, WhitespaceFilter, \
                          NamespaceFlattener, DocTypeInserter
import genshi.template


import cherrypy

from libgutenberg import GutenbergGlobals as gg

import BaseSearcher

# Line-collapsing callable handed to WHITESPACE_FILTER in BaseFormatter.render().
# Use a bit more aggressive whitespace removal than the standard whitespace
# filter: the pattern matches a newline plus the run of one or more whitespace
# characters (space, tab, CR, LF) that follows it.  This is the bound .sub
# method of the compiled pattern, so the caller supplies the replacement text.
COLLAPSE_LINES = re.compile('\n[ \t\r\n]+').sub

# Single module-level filter instance, reused by every BaseFormatter.render() call.
WHITESPACE_FILTER = genshi.output.WhitespaceFilter()

# Memo for BaseFormatter.data_url(): maps the requested path to the URL string
# that was computed for it.  Nothing in this file ever removes an entry.
DATA_URL_CACHE = {}

class BaseFormatter(object):
    """ Base class for formatters.

    Subclasses override format() and get_serializer().  Templates are
    registered per page with set_template() and rendered with render().

    Note: several methods take a parameter named ``os`` (the opensearch
    object, judging by the comments in fix_dc()).  Inside those methods it
    shadows the ``os`` module.
    """

    # value sent as the HTTP Content-Type header by send_headers()
    CONTENT_TYPE = 'text/html; charset=UTF-8'

    def __init__(self):
        # maps page key -> template, filled by set_template()
        self.templates = {}


    def format(self, page, os):
        """ Abstract method to override. """
        pass


    def get_serializer(self):
        """ Abstract method to override.

        Like this:
        return genshi.output.XMLSerializer(doctype = self.DOCTYPE, strip_whitespace = False)

        """
        pass


    def send_headers(self):
        """ Send HTTP content-type header. """
        cherrypy.response.headers['Content-Type'] = self.CONTENT_TYPE


    def render(self, page, os):
        """ Render the template registered for page.

        Sends the content-type header, runs the template's own filters and
        the module-level whitespace filter over the template stream, then
        serializes with get_serializer().

        Returns the serialized page as encoded by genshi.output.encode
        with encoding='utf-8'.

        Raises KeyError if no template was registered for page.
        """

        self.send_headers()

        template = self.templates[page]
        ctxt = genshi.template.Context(cherrypy=cherrypy, os=os, bs=BaseSearcher)

        stream = template.stream
        for filter_ in template.filters:
            stream = filter_(iter(stream), ctxt)

        # there's no easy way in genshi to pass collapse_lines to this filter
        stream = WHITESPACE_FILTER(stream, collapse_lines=COLLAPSE_LINES)

        return genshi.output.encode(self.get_serializer()(_ensure(genshi.Stream(stream))),
                                     encoding='utf-8')


    def set_template(self, page, template):
        """ Set template for page.

        Override this for special handling of template, like adding filters. """
        self.templates[page] = template


    @staticmethod
    def format_date(date):
        """ Format a date or datetime as an ISO 8601 string.

        Returns '' when date is None.  A datetime has its microseconds
        dropped and its tzinfo replaced by gg.UTC(); the clock value itself
        is not converted.  A plain date is combined with midnight in
        gg.UTC().
        """

        if date is None:
            return ''

        try:
            # datetime: replace() accepts tzinfo and microsecond
            return date.replace(tzinfo=gg.UTC(), microsecond=0).isoformat()
        except TypeError:
            # plain date: replace() rejects those keywords with TypeError
            return datetime.datetime.combine(
                date, datetime.time(tzinfo=gg.UTC())).isoformat()

    @staticmethod
    def data_url(path):
        """ Read and convert a file to a data url.

        The file is fetched over https from cherrypy.config['file_host'].
        On success the result is 'data:<mediatype>;base64,<payload>'.  If
        the fetch fails with IOError, the status is not 200, or there is no
        Content-Type header, the plain absolute https URL is returned
        instead (deliberate best effort).

        Either result is stored in DATA_URL_CACHE under path, so each path
        is fetched at most once per process.
        """
        if path in DATA_URL_CACHE:
            return DATA_URL_CACHE[path]

        # Build the URL with '/' explicitly: os.path.join would insert the
        # platform separator, which is a backslash on Windows.
        base = 'https://' + cherrypy.config['file_host']
        if not base.endswith('/'):
            base += '/'
        abs_path = base + path.lstrip('/')

        data_url = abs_path
        try:
            f = urllib.request.urlopen(abs_path)
            try:
                retcode = f.getcode()
                if retcode is None or retcode == 200:
                    mediatype = f.info().get('Content-Type')
                    if mediatype:
                        # drop parameters such as '; charset=...'
                        mediatype = mediatype.partition(';')[0]
                        data_url = ('data:' + mediatype + ';base64,' +
                                    base64.b64encode(f.read()).decode('ascii'))
            finally:
                # close the response even if reading it raised
                f.close()
        except IOError:
            # best effort: fall back to the plain URL set above
            pass

        DATA_URL_CACHE[path] = data_url
        return data_url


    def fix_dc(self, dc, os):
        """ Add some info to dc for easier templating.

        Mutates its arguments in place and returns None:

        - dc.marcs loses every entry whose code starts with '9'
        - dc.cover_image / dc.cover_thumb are set to the matching file
          objects, or None
        - os.cover_image_url / os.cover_thumb_url are set to the matching
          urls, or None; os.snippet_image_url is set only when a
          'cover.medium' file exists
        - dc.xsd_release_date_time is set from dc.release_date
        - every file gets its url switched from http:// to https:// and
          its dropbox/gdrive/msdrive/honeypot url attributes reset to None
        - dc.icon is set to 'audiobook' when 'Sound' is in dc.categories
        """

        # obsolete private marc codes for cover art
        dc.marcs = [marc for marc in dc.marcs if not marc.code.startswith('9')]

        dc.cover_image = None
        dc.cover_thumb = None
        # cover image really should not be a property of opensearch,
        # but it is accessed in many places and this way we can save a
        # lot of iterations later
        os.cover_image_url = None
        os.cover_thumb_url = None

        # Set once, outside the file loop: the value does not depend on the
        # files, and it must exist even when dc.files is empty.
        dc.xsd_release_date_time = self.format_date(dc.release_date)

        for file_ in dc.files:

            # HACK for https://
            if file_.url.startswith('http://'):
                file_.url = 'https' + file_.url[4:]

            file_.dropbox_url = None
            # file_.dropbox_filename = None
            file_.gdrive_url = None
            file_.msdrive_url = None
            file_.honeypot_url = None

            if file_.filetype == 'cover.medium':
                dc.cover_image = file_
                os.snippet_image_url = os.cover_image_url = file_.url
            elif file_.filetype == 'cover.small':
                dc.cover_thumb = file_
                os.cover_thumb_url = file_.url

        if 'Sound' in dc.categories:
            dc.icon = 'audiobook'


# lifted from genshi/output.py and fixed lang issue
# lang is not allowed in xhtml 1.1 which we must use
# because xhtml+rdfa is based on it

class XHTMLSerializer(genshi.output.XMLSerializer):
    """Produces XHTML text from an event stream.

    Lifted from genshi/output.py with one deliberate difference: an
    ``xml:lang`` attribute is written out unchanged and is NOT mirrored
    into an additional ``lang`` attribute.  ``lang`` is not allowed in
    XHTML 1.1, which must be used because XHTML+RDFa is based on it.

    >>> from genshi.builder import tag
    >>> elem = tag.div(tag.a(href='foo'), tag.br, tag.hr(noshade=True))
    >>> print(''.join(XHTMLSerializer()(elem.generate())))
    <div><a href="foo"></a><br /><hr noshade="noshade" /></div>
    """

    # elements written as '<tag />' when empty; all others get '<tag></tag>'
    _EMPTY_ELEMS = frozenset(['area', 'base', 'basefont', 'br', 'col', 'frame',
                              'hr', 'img', 'input', 'isindex', 'link', 'meta',
                              'param'])
    # attributes whose value is forced to the attribute name itself
    _BOOLEAN_ATTRS = frozenset(['selected', 'checked', 'compact', 'declare',
                                'defer', 'disabled', 'ismap', 'multiple',
                                'nohref', 'noresize', 'noshade', 'nowrap'])
    # elements handed to WhitespaceFilter as whitespace-preserving
    _PRESERVE_SPACE = frozenset([
        QName('pre'), QName('http://www.w3.org/1999/xhtml}pre'),
        QName('textarea'), QName('http://www.w3.org/1999/xhtml}textarea')
    ])

    def __init__(self, doctype=None, strip_whitespace=True,
                 namespace_prefixes=None, drop_xml_decl=True, cache=True):
        """Set up the filter chain.

        doctype: passed to the base class and, when truthy, to a
            DocTypeInserter filter.
        strip_whitespace: when true, add a WhitespaceFilter that keeps
            whitespace in the _PRESERVE_SPACE elements.
        namespace_prefixes: optional mapping of namespace URI to prefix.
            The XHTML namespace is always mapped to the empty prefix.  The
            mapping passed in is copied, not modified.
        drop_xml_decl: when true, XML_DECL events produce no output.
        cache: when true, serialized output is memoized per event.
        """
        super(XHTMLSerializer, self).__init__(doctype, False)
        self.filters = [EmptyTagFilter()]
        if strip_whitespace:
            self.filters.append(WhitespaceFilter(self._PRESERVE_SPACE))
        # copy so that the caller's dict is not changed as a side effect
        namespace_prefixes = dict(namespace_prefixes or {})
        namespace_prefixes['http://www.w3.org/1999/xhtml'] = ''
        self.filters.append(NamespaceFlattener(prefixes=namespace_prefixes,
                                               cache=cache))
        if doctype:
            self.filters.append(DocTypeInserter(doctype))
        self.drop_xml_decl = drop_xml_decl
        self.cache = cache

    def __call__(self, stream):
        """Serialize stream, yielding one output fragment per handled event.

        The stream is first passed through self.filters.  Only the first
        DOCTYPE and the first XML_DECL event produce output, and XML_DECL
        only when drop_xml_decl is false.
        """
        boolean_attrs = self._BOOLEAN_ATTRS
        empty_elems = self._EMPTY_ELEMS
        drop_xml_decl = self.drop_xml_decl
        have_decl = have_doctype = False
        in_cdata = False

        # per-call memo of (kind, data) -> serialized output
        cache = {}
        cache_get = cache.get
        if self.cache:
            def _emit(kind, data, output):
                cache[kind, data] = output
                return output
        else:
            def _emit(kind, data, output):
                return output

        for filter_ in self.filters:
            stream = filter_(stream)
        for kind, data, _pos in stream:
            if kind is TEXT and in_cdata:
                # Text inside CDATA is written raw.  It must bypass the
                # cache in both directions: the cache key does not record
                # CDATA state, so sharing entries would replay raw text
                # unescaped outside CDATA, or escaped text inside it.
                yield data
                continue

            cached = cache_get((kind, data))
            if cached is not None:
                yield cached

            elif kind is START or kind is EMPTY:
                tag, attrib = data
                buf = ['<', tag]
                for attr, value in attrib:
                    if attr in boolean_attrs:
                        value = attr
                    # Difference from genshi: there is no branch here that
                    # adds lang="..." for xml:lang; xml:lang falls through
                    # and is written out like any other attribute.
                    elif attr == 'xml:space':
                        continue
                    buf += [' ', attr, '="', escape(value), '"']
                if kind is EMPTY:
                    if tag in empty_elems:
                        buf.append(' />')
                    else:
                        buf.append('></%s>' % tag)
                else:
                    buf.append('>')
                yield _emit(kind, data, Markup(''.join(buf)))

            elif kind is END:
                yield _emit(kind, data, Markup('</%s>' % data))

            elif kind is TEXT:
                # outside CDATA (the CDATA case was handled above)
                yield _emit(kind, data, escape(data, quotes=False))

            elif kind is COMMENT:
                yield _emit(kind, data, Markup('<!--%s-->' % data))

            elif kind is DOCTYPE and not have_doctype:
                name, pubid, sysid = data
                buf = ['<!DOCTYPE %s']
                if pubid:
                    buf.append(' PUBLIC "%s"')
                elif sysid:
                    buf.append(' SYSTEM')
                if sysid:
                    buf.append(' "%s"')
                buf.append('>\n')
                # one %s placeholder was added per non-empty part
                yield Markup(''.join(buf)) % tuple([p for p in data if p])
                have_doctype = True

            elif kind is XML_DECL and not have_decl and not drop_xml_decl:
                version, encoding, standalone = data
                buf = ['<?xml version="%s"' % version]
                if encoding:
                    buf.append(' encoding="%s"' % encoding)
                # -1 means: write no standalone attribute
                if standalone != -1:
                    standalone = 'yes' if standalone else 'no'
                    buf.append(' standalone="%s"' % standalone)
                buf.append('?>\n')
                yield Markup(''.join(buf))
                have_decl = True

            elif kind is START_CDATA:
                yield Markup('<![CDATA[')
                in_cdata = True

            elif kind is END_CDATA:
                yield Markup(']]>')
                in_cdata = False

            elif kind is PI:
                yield _emit(kind, data, Markup('<?%s %s?>' % data))
initial commit 2019-03-28 13:45:03 +00:00			`#!/usr/bin/env python`
			`# -- mode: python; indent-tabs-mode: nil; -- coding: iso-8859-1 -*-`

			`"""`
			`BaseFormatter.py`

			`Copyright 2009-2010 by Marcello Perathoner`

			`Distributable under the GNU General Public License Version 3 or newer.`

			`Base class for output formatters.`

			`"""`

			`from __future__ import unicode_literals`

delint 2020-09-15 18:07:09 +00:00			`import base64`
initial commit 2019-03-28 13:45:03 +00:00			`import datetime`
delint 2020-09-15 18:07:09 +00:00			`import os`
initial commit 2019-03-28 13:45:03 +00:00			`import re`
			`from six.moves import urllib`

delint 2020-09-15 18:07:09 +00:00			`from genshi.core import _ensure`
			`from genshi.core import escape, Markup, QName`
			`from genshi.core import START, END, TEXT, XML_DECL, DOCTYPE, START_CDATA, END_CDATA, PI, COMMENT`
initial commit 2019-03-28 13:45:03 +00:00			`import genshi.output`
delint 2020-09-15 18:07:09 +00:00			`from genshi.output import EMPTY, EmptyTagFilter, WhitespaceFilter, \`
			`NamespaceFlattener, DocTypeInserter`
initial commit 2019-03-28 13:45:03 +00:00			`import genshi.template`
delint 2020-09-15 18:07:09 +00:00

initial commit 2019-03-28 13:45:03 +00:00			`import cherrypy`

			`from libgutenberg import GutenbergGlobals as gg`

			`import BaseSearcher`

			`# use a bit more aggressive whitespace removal than the standard whitespace filter`
			`COLLAPSE_LINES = re.compile('\n[ \t\r\n]+').sub`

delint 2020-09-15 18:07:09 +00:00			`WHITESPACE_FILTER = genshi.output.WhitespaceFilter()`
initial commit 2019-03-28 13:45:03 +00:00
			`DATA_URL_CACHE = {}`

delint 2020-09-15 18:07:09 +00:00			`class BaseFormatter(object):`
initial commit 2019-03-28 13:45:03 +00:00			`""" Base class for formatters. """`

			`CONTENT_TYPE = 'text/html; charset=UTF-8'`

delint 2020-09-15 18:07:09 +00:00			`def __init__(self):`
initial commit 2019-03-28 13:45:03 +00:00			`self.templates = {}`


delint 2020-09-15 18:07:09 +00:00			`def format(self, page, os):`
initial commit 2019-03-28 13:45:03 +00:00			`""" Abstract method to override. """`
			`pass`


delint 2020-09-15 18:07:09 +00:00			`def get_serializer(self):`
initial commit 2019-03-28 13:45:03 +00:00			`""" Abstract method to override.`

			`Like this:`
delint 2020-09-15 18:07:09 +00:00			`return genshi.output.XMLSerializer(doctype = self.DOCTYPE, strip_whitespace = False)`
initial commit 2019-03-28 13:45:03 +00:00
			`"""`
			`pass`


delint 2020-09-15 18:07:09 +00:00			`def send_headers(self):`
initial commit 2019-03-28 13:45:03 +00:00			`""" Send HTTP content-type header. """`
			`cherrypy.response.headers['Content-Type'] = self.CONTENT_TYPE`


delint 2020-09-15 18:07:09 +00:00			`def render(self, page, os):`
initial commit 2019-03-28 13:45:03 +00:00			`""" Render and send to browser. """`

delint 2020-09-15 18:07:09 +00:00			`self.send_headers()`
initial commit 2019-03-28 13:45:03 +00:00
			`template = self.templates[page]`
delint 2020-09-15 18:07:09 +00:00			`ctxt = genshi.template.Context(cherrypy=cherrypy, os=os, bs=BaseSearcher)`
initial commit 2019-03-28 13:45:03 +00:00
			`stream = template.stream`
			`for filter_ in template.filters:`
delint 2020-09-15 18:07:09 +00:00			`stream = filter_(iter(stream), ctxt)`
initial commit 2019-03-28 13:45:03 +00:00
			`# there's no easy way in genshi to pass collapse_lines to this filter`
delint 2020-09-15 18:07:09 +00:00			`stream = WHITESPACE_FILTER(stream, collapse_lines=COLLAPSE_LINES)`
initial commit 2019-03-28 13:45:03 +00:00
delint 2020-09-15 18:07:09 +00:00			`return genshi.output.encode(self.get_serializer()(_ensure(genshi.Stream(stream))),`
			`encoding='utf-8')`
initial commit 2019-03-28 13:45:03 +00:00

delint 2020-09-15 18:07:09 +00:00			`def set_template(self, page, template):`
initial commit 2019-03-28 13:45:03 +00:00			`""" Set template for page.`

			`Override this for special handling of template, like adding filters. """`
			`self.templates[page] = template`


			`@staticmethod`
delint 2020-09-15 18:07:09 +00:00			`def format_date(date):`
initial commit 2019-03-28 13:45:03 +00:00			`""" Format a date. """`

			`if date is None:`
			`return ''`

			`try:`
			`# datetime`
delint 2020-09-15 18:07:09 +00:00			`return date.replace(tzinfo=gg.UTC(), microsecond=0).isoformat()`
initial commit 2019-03-28 13:45:03 +00:00			`except TypeError:`
			`# date`
delint 2020-09-15 18:07:09 +00:00			`return datetime.datetime.combine(`
			`date, datetime.time(tzinfo=gg.UTC())).isoformat()`
initial commit 2019-03-28 13:45:03 +00:00
			`@staticmethod`
delint 2020-09-15 18:07:09 +00:00			`def data_url(path):`
initial commit 2019-03-28 13:45:03 +00:00			`""" Read and convert a file to a data url. """`
			`if path in DATA_URL_CACHE:`
			`return DATA_URL_CACHE[path]`

delint 2020-09-15 18:07:09 +00:00			`abs_path = os.path.join('https://' + cherrypy.config['file_host'], path.lstrip('/'))`
initial commit 2019-03-28 13:45:03 +00:00			`data_url = abs_path`
			`try:`
delint 2020-09-15 18:07:09 +00:00			`f = urllib.request.urlopen(abs_path)`
			`retcode = f.getcode()`
initial commit 2019-03-28 13:45:03 +00:00			`if retcode is None or retcode == 200:`
delint 2020-09-15 18:07:09 +00:00			`msg = f.info()`
			`mediatype = msg.get('Content-Type')`
initial commit 2019-03-28 13:45:03 +00:00			`if mediatype:`
delint 2020-09-15 18:07:09 +00:00			`mediatype = mediatype.partition(';')[0]`
initial commit 2019-03-28 13:45:03 +00:00			`data_url = ('data:' + mediatype + ';base64,' +`
delint 2020-09-15 18:07:09 +00:00			`base64.b64encode(f.read()).decode('ascii'))`
			`f.close()`
initial commit 2019-03-28 13:45:03 +00:00			`except IOError:`
			`pass`

			`DATA_URL_CACHE[path] = data_url`
			`return data_url`


delint 2020-09-15 18:07:09 +00:00			`def fix_dc(self, dc, os):`
initial commit 2019-03-28 13:45:03 +00:00			`""" Add some info to dc for easier templating. """`

			`# obsolete private marc codes for cover art`
delint 2020-09-15 18:07:09 +00:00			`dc.marcs = [ marc for marc in dc.marcs if not marc.code.startswith('9') ]`
initial commit 2019-03-28 13:45:03 +00:00
			`dc.cover_image = None`
			`dc.cover_thumb = None`
			`# cover image really should not be a property of opensearch,`
			`# but it is accessed in many places and this way we can save a`
			`# lot of iterations later`
delint 2020-09-15 18:07:09 +00:00			`os.cover_image_url = None`
			`os.cover_thumb_url = None`
initial commit 2019-03-28 13:45:03 +00:00
			`for file_ in dc.files:`

			`# HACK for https://`
delint 2020-09-15 18:07:09 +00:00			`if file_.url.startswith('http://'):`
			`file_.url = 'https' + file_.url[4:]`
initial commit 2019-03-28 13:45:03 +00:00
			`file_.dropbox_url = None`
			`# file_.dropbox_filename = None`
			`file_.gdrive_url = None`
			`file_.msdrive_url = None`
			`file_.honeypot_url = None`

			`if file_.filetype == 'cover.medium':`
			`dc.cover_image = file_`
			`os.snippet_image_url = os.cover_image_url = file_.url`
			`elif file_.filetype == 'cover.small':`
			`dc.cover_thumb = file_`
			`os.cover_thumb_url = file_.url`

delint 2020-09-15 18:07:09 +00:00			`dc.xsd_release_date_time = self.format_date(dc.release_date)`
initial commit 2019-03-28 13:45:03 +00:00
			`if 'Sound' in dc.categories:`
			`dc.icon = 'audiobook'`


			`# lifted from genshi/output.py and fixed lang issue`
			`# lang is not allowed in xhtml 1.1 which we must use`
			`# because xhtml+rdfa is based on it`

delint 2020-09-15 18:07:09 +00:00			`class XHTMLSerializer(genshi.output.XMLSerializer):`
initial commit 2019-03-28 13:45:03 +00:00			`"""Produces XHTML text from an event stream.`

			`>>> from genshi.builder import tag`
			`>>> elem = tag.div(tag.a(href='foo'), tag.br, tag.hr(noshade=True))`
			`>>> print(''.join(XHTMLSerializer()(elem.generate())))`
			`<div><a href="foo"></a><br /><hr noshade="noshade" /></div>`
			`"""`

			`_EMPTY_ELEMS = frozenset(['area', 'base', 'basefont', 'br', 'col', 'frame',`
			`'hr', 'img', 'input', 'isindex', 'link', 'meta',`
			`'param'])`
			`_BOOLEAN_ATTRS = frozenset(['selected', 'checked', 'compact', 'declare',`
			`'defer', 'disabled', 'ismap', 'multiple',`
			`'nohref', 'noresize', 'noshade', 'nowrap'])`
			`_PRESERVE_SPACE = frozenset([`
			`QName('pre'), QName('http://www.w3.org/1999/xhtml}pre'),`
			`QName('textarea'), QName('http://www.w3.org/1999/xhtml}textarea')`
			`])`

			`def __init__(self, doctype=None, strip_whitespace=True,`
			`namespace_prefixes=None, drop_xml_decl=True, cache=True):`
			`super(XHTMLSerializer, self).__init__(doctype, False)`
			`self.filters = [EmptyTagFilter()]`
			`if strip_whitespace:`
			`self.filters.append(WhitespaceFilter(self._PRESERVE_SPACE))`
			`namespace_prefixes = namespace_prefixes or {}`
			`namespace_prefixes['http://www.w3.org/1999/xhtml'] = ''`
			`self.filters.append(NamespaceFlattener(prefixes=namespace_prefixes,`
			`cache=cache))`
			`if doctype:`
			`self.filters.append(DocTypeInserter(doctype))`
			`self.drop_xml_decl = drop_xml_decl`
			`self.cache = cache`

			`def __call__(self, stream):`
			`boolean_attrs = self._BOOLEAN_ATTRS`
			`empty_elems = self._EMPTY_ELEMS`
			`drop_xml_decl = self.drop_xml_decl`
			`have_decl = have_doctype = False`
			`in_cdata = False`

			`cache = {}`
			`cache_get = cache.get`
			`if self.cache:`
			`def _emit(kind, input, output):`
			`cache[kind, input] = output`
			`return output`
			`else:`
			`def _emit(kind, input, output):`
			`return output`

			`for filter_ in self.filters:`
			`stream = filter_(stream)`
			`for kind, data, pos in stream:`
			`cached = cache_get((kind, data))`
			`if cached is not None:`
			`yield cached`

			`elif kind is START or kind is EMPTY:`
			`tag, attrib = data`
			`buf = ['<', tag]`
			`for attr, value in attrib:`
			`if attr in boolean_attrs:`
			`value = attr`
			`# this is the fix`
			`# elif attr == 'xml:lang' and 'lang' not in attrib:`
			`# buf += [' lang="', escape(value), '"']`
			`elif attr == 'xml:space':`
			`continue`
			`buf += [' ', attr, '="', escape(value), '"']`
			`if kind is EMPTY:`
			`if tag in empty_elems:`
			`buf.append(' />')`
			`else:`
			`buf.append('></%s>' % tag)`
			`else:`
			`buf.append('>')`
			`yield _emit(kind, data, Markup(''.join(buf)))`

			`elif kind is END:`
			`yield _emit(kind, data, Markup('</%s>' % data))`

			`elif kind is TEXT:`
			`if in_cdata:`
			`yield _emit(kind, data, data)`
			`else:`
			`yield _emit(kind, data, escape(data, quotes=False))`

			`elif kind is COMMENT:`
			`yield _emit(kind, data, Markup('<!--%s-->' % data))`

			`elif kind is DOCTYPE and not have_doctype:`
			`name, pubid, sysid = data`
			`buf = ['<!DOCTYPE %s']`
			`if pubid:`
			`buf.append(' PUBLIC "%s"')`
			`elif sysid:`
			`buf.append(' SYSTEM')`
			`if sysid:`
			`buf.append(' "%s"')`
			`buf.append('>\n')`
			`yield Markup(''.join(buf)) % tuple([p for p in data if p])`
			`have_doctype = True`

			`elif kind is XML_DECL and not have_decl and not drop_xml_decl:`
			`version, encoding, standalone = data`
			`buf = ['<?xml version="%s"' % version]`
			`if encoding:`
			`buf.append(' encoding="%s"' % encoding)`
			`if standalone != -1:`
			`standalone = standalone and 'yes' or 'no'`
			`buf.append(' standalone="%s"' % standalone)`
			`buf.append('?>\n')`
			`yield Markup(''.join(buf))`
			`have_decl = True`

			`elif kind is START_CDATA:`
			`yield Markup('<![CDATA[')`
			`in_cdata = True`

			`elif kind is END_CDATA:`
			`yield Markup(']]>')`
			`in_cdata = False`

			`elif kind is PI:`
			`yield _emit(kind, data, Markup('<?%s %s?>' % data))`