# regluit/api/opds.py
#
# Builds OPDS (Atom) catalog feeds for unglue.it works and facets.
import datetime
2020-03-27 17:48:18 +00:00
from itertools import islice
import logging
2020-02-17 21:34:02 +00:00
from urllib.parse import urlparse, urlunparse
2020-03-27 17:48:18 +00:00
from bs4 import BeautifulSoup
import pytz
2018-07-24 15:33:39 +00:00
from django.urls import reverse
2015-01-14 23:04:17 +00:00
from django.utils.http import urlquote
2014-12-05 23:38:04 +00:00
from regluit.core import models, facets
import regluit.core.cc as cc
licenses = cc.LICENSE_LIST
2020-03-27 17:48:18 +00:00
logger = logging.getLogger(__name__)

# Module-wide BeautifulSoup instance used only as a tag factory
# (soup.new_tag); created lazily by opds_feed_for_works().
soup = None

# ebook format code -> MIME type for OPDS acquisition links
FORMAT_TO_MIMETYPE = {'pdf':"application/pdf",
                      'epub':"application/epub+zip",
                      'mobi':"application/x-mobipocket-ebook",
                      'html':"text/html",
                      'text':"text/html"}

UNGLUEIT_URL = 'https://unglue.it'

# media type for OPDS acquisition feeds
ACQUISITION = "application/atom+xml;profile=opds-catalog;kind=acquisition"
# link relation for OPDS facet links
FACET_RELATION = "http://opds-spec.org/facet"

# legacy hand-written facet classes (defined below in this module)
old_facets = ["creative_commons", "active_campaigns"]
2014-12-05 23:38:04 +00:00
2014-07-17 04:00:16 +00:00
def feeds():
    """Yield a feed class for every available facet.

    Legacy hand-written facets come first, followed by a generated
    class for each Format and Keyword facet path.
    """
    for name in old_facets:
        yield globals()[name]
    for group in ('Format', 'Keyword'):
        for facet_path in facets.get_all_facets(group):
            yield get_facet_facet(facet_path)
2014-12-05 23:38:04 +00:00
def get_facet_class(name):
    """Return the feed class for *name*: the legacy class when it is an
    old-style facet, otherwise a generated facet class."""
    return globals()[name] if name in old_facets else get_facet_facet(name)
def text_node(tag, text):
    """Return a new <tag> element whose string content is *text*.

    When *text* is falsy (None or empty) the element is left empty.
    """
    element = soup.new_tag(tag)
    if text:
        element.string = text
    return element
2015-01-14 23:04:17 +00:00
def html_node(tag, html):
    """Like text_node(), but marks the content as HTML via type='html'."""
    element = text_node(tag, html)
    element["type"] = 'html'
    return element
2020-03-27 18:23:43 +00:00
def add_query_component(url, qc):
    """Return *url* with the query component *qc* appended to its querystring.

    If the URL already has a query, *qc* is joined on with '&'; otherwise
    *qc* becomes the whole query.
    """
    parts = list(urlparse(url))
    existing = parts[4]
    parts[4] = existing + "&" + qc if existing else qc
    return urlunparse(parts)
def isbn_node(isbn):
    """Return a dcterms:identifier element carrying *isbn* as a URN."""
    identifier = soup.new_tag("dcterms:identifier")
    identifier["xsi:type"] = 'dcterms:URI'
    identifier.string = 'urn:ISBN:' + isbn
    return identifier
def work_node(work, facet=None):
    """Build an OPDS <entry> element for *work*.

    The entry carries the title, id, acquisition links for each distinct
    ebook version, cover links, bibliographic metadata (author, publisher,
    language, issued date, ISBNs), subject categories, age level, and a
    schema:Rating node.

    facet: optional facet object whose filter_model() narrows which of the
        work's ebooks get acquisition links; when None, all ebooks are used.
    Returns the new <entry> Tag (created via the module-level soup factory).
    """
    node = soup.new_tag("entry")
    # title
    node.append(text_node("title", work.title))

    # id: canonical unglue.it URL for the work
    node.append(text_node(
        'id',
        "{base}{url}".format(
            base=UNGLUEIT_URL,
            url=reverse('work_identifier', kwargs={'work_id': work.id})
        )
    ))

    updated = None

    # links for all ebooks (optionally narrowed by the facet)
    ebooks = facet.filter_model("Ebook", work.ebooks()) if facet else work.ebooks()
    versions = set()
    for ebook in ebooks:
        if updated is None:
            # most recent ebook, first ebook in loop
            updated = ebook.created.isoformat()
            node.append(text_node('updated', updated))
        # emit only one acquisition link per version label
        if not ebook.version_label in versions:
            versions.add(ebook.version_label)
            link_node = soup.new_tag("link")

            # ebook.download_url is an absolute URL with the protocol, domain, and path baked in
            link_rel = "http://opds-spec.org/acquisition/open-access"
            link_node.attrs.update({
                "href":add_query_component(ebook.download_url, "feed=opds"),
                "rel":link_rel,
                "dcterms:rights": str(ebook.rights)
            })
            if ebook.is_direct():
                link_node["type"] = FORMAT_TO_MIMETYPE.get(ebook.format, "")
            else:
                # indirect acquisition, i.e. google books
                link_node["type"] = "text/html"
                indirect = soup.new_tag("opds:indirectAcquisition",)
                indirect["type"] = FORMAT_TO_MIMETYPE.get(ebook.format, "")
                link_node.append(indirect)
            if ebook.version_label:
                link_node.attrs.update({"schema:version": ebook.version_label})
            node.append(link_node)

    # cover links: small image as OPDS thumbnail, thumbnail image as OPDS image
    cover_node = soup.new_tag("link")
    cover_node.attrs.update({
        "href": work.cover_image_small(),
        "type": "image/" + work.cover_filetype(),
        "rel": "http://opds-spec.org/image/thumbnail"
    })
    node.append(cover_node)
    cover_node = soup.new_tag("link")
    cover_node.attrs.update({
        "href": work.cover_image_thumbnail(),
        "type": "image/" + work.cover_filetype(),
        "rel": "http://opds-spec.org/image"
    })
    node.append(cover_node)

    # <dcterms:issued>2012</dcterms:issued>
    node.append(text_node("dcterms:issued", work.publication_date))

    # author
    # TO DO: include all authors?
    author_node = soup.new_tag("author")
    author_node.append(text_node("name", work.author()))
    node.append(author_node)

    # publisher
    # <dcterms:publisher>Open Book Publishers</dcterms:publisher>
    if work.publishers().count():
        for publisher in work.publishers():
            node.append(text_node("dcterms:publisher", publisher.name.name))

    # language
    # <dcterms:language>en</dcterms:language>
    node.append(text_node("dcterms:language", work.language))

    # description
    node.append(html_node("content", work.description))

    # identifiers
    if work.identifiers.filter(type='isbn'):
        for isbn in work.identifiers.filter(type='isbn')[0:9]: # at most 9 ISBNs
            node.append(isbn_node(isbn.value))

    # subject tags
    for subject in work.subjects.all():
        if subject.is_visible:
            category_node = soup.new_tag("category")
            try:
                category_node["term"] = subject.name
                node.append(category_node)
                try:
                    # probe for a second free work; IndexError means this
                    # subject's feed would hold only the current work
                    subject.works.filter(is_free=True)[1]
                    # only show feed if there's another work in it
                    node.append(navlink('related', 'kw.' + subject.name, 0,
                                        'popular', title=subject.name))
                except IndexError:
                    # was a bare except; narrowed to the probe's failure mode
                    pass
            except ValueError:
                # caused by control chars in subject.name
                logger.warning('Deleting subject: %s' % subject.name)
                subject.delete()

    # age level
    # <category term="15-18" scheme="http://schema.org/typicalAgeRange"
    #           label="Teen - Grade 10-12, Age 15-18"/>
    if work.age_level:
        category_node = soup.new_tag("category")
        category_node["scheme"] = 'http://schema.org/typicalAgeRange'
        category_node["term"] = work.age_level
        category_node["label"] = work.get_age_level_display()
        node.append(category_node)

    # rating
    rating_node = soup.new_tag("schema:Rating")
    rating_node.attrs.update({"schema:ratingValue":"{:}".format(work.priority())})
    node.append(rating_node)
    return node
2014-07-17 04:00:16 +00:00
class Facet:
    """Base class for an OPDS feed facet: a titled, described set of works
    reachable at feed_path."""
    title = ''
    works = None
    feed_path = ''
    description = ''

    def feed(self, page=None, order_by='newest'):
        """Order this facet's works and return the OPDS feed generator."""
        ordering = facets.get_order_by(order_by)
        self.works = self.works.order_by(*ordering)
        return opds_feed_for_works(self, page=page, order_by=order_by)

    def updated(self):
        """ISO timestamp (UTC) of the most recently added work; now when empty."""
        if self.works:
            stamp = self.works[0].created
        else:
            stamp = datetime.datetime.utcnow()
        return pytz.utc.localize(stamp).isoformat()
2014-12-05 23:38:04 +00:00
def get_facet_facet(facet_path):
    """Build and return a Facet subclass for the facet at *facet_path*."""
    class Facet_Facet(Facet):

        def __init__(self, facet_path=facet_path):
            self.feed_path = facet_path
            self.facet_object = facets.get_facet_object(facet_path)
            # title is "Unglue.it" followed by each component facet's title
            pieces = ["Unglue.it"] + [f.title for f in self.facet_object.facets()]
            self.title = " ".join(pieces)
            self.works = self.facet_object.get_query_set().distinct()
            self.description = self.facet_object.description
    return Facet_Facet
2014-07-17 04:00:16 +00:00
class creative_commons(Facet):
    """Facet of all works having an ebook under a Creative Commons license."""
    def __init__(self):
        self.title = "Unglue.it Catalog: Creative Commons Books"
        self.feed_path = "creative_commons"
        cc_works = models.Work.objects.filter(
            editions__ebooks__isnull=False,
            editions__ebooks__rights__in=cc.LICENSE_LIST
        )
        self.works = cc_works.distinct()
        self.description = """These Creative Commons licensed ebooks are free to read - the people
        who created them want you to read and share them."""
        self.facet_object = facets.get_facet_object(self.feed_path)
2020-03-27 18:23:43 +00:00
2014-07-17 04:00:16 +00:00
class active_campaigns(Facet):
    """
    return opds feed for works associated with active campaigns
    """
    def __init__(self):
        self.title = "Unglue.it Catalog: Books under Active Campaign"
        self.feed_path = "active_campaigns"
        self.facet_object = facets.get_facet_object(self.feed_path)
        self.works = models.Work.objects.filter(campaigns__status='ACTIVE', is_free=True)
        self.description = """With your help we're raising money
        to make these books free to the world."""
def opds_feed_for_work(work_id):
    """Return an OPDS feed generator for the single work *work_id*.

    Wraps the work in a throwaway facet object so opds_feed_for_works()
    can render it; an invalid or unknown id yields an empty feed.
    """
    class single_work_facet:
        def __init__(self, work_id):
            try:
                works = models.Work.objects.filter(id=work_id)
            except (models.Work.DoesNotExist, ValueError):
                # ValueError: work_id isn't a valid integer.
                # (filter() itself never raises DoesNotExist; the clause is
                # kept for backward compatibility with the original intent.)
                works = models.Work.objects.none()
            self.works = works
            self.title = 'Unglue.it work #%s' % work_id
            self.feed_path = ''
            self.facet_object = facets.BaseFacet(None)
    return opds_feed_for_works(single_work_facet(work_id))
2014-12-06 20:00:23 +00:00
def opds_feed_for_works(the_facet, page=None, order_by='newest'):
    """Generate an OPDS acquisition feed, yielded as XML string fragments.

    the_facet: object with works, feed_path, title, and facet_object attrs.
    page: zero-based page number (10 works per page); non-numeric values
        fall back to page 0.
    order_by: sort key name, reflected in the feed id/title and facet links.

    Yields the XML header, feed metadata, navigation/facet links, one
    <entry> per work on the page, and the closing </feed> tag.
    """
    global soup
    if not soup:
        # lazily create the shared tag factory
        soup = BeautifulSoup('', 'lxml')
    works = the_facet.works
    feed_path = the_facet.feed_path
    title = the_facet.title
    feed_header = """<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns:dcterms="http://purl.org/dc/terms/"
  xmlns:opds="http://opds-spec.org/"
  xmlns="http://www.w3.org/2005/Atom"
  xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xmlns:schema="http://schema.org/"
  xsi:noNamespaceSchemaLocation="http://www.kbcafe.com/rss/atom.xsd.xml"
  xsi:schemaLocation="http://purl.org/dc/elements/1.1/ http://dublincore.org/schemas/xmls/qdc/2008/02/11/dc.xsd
  http://purl.org/dc/terms/ http://dublincore.org/schemas/xmls/qdc/2008/02/11/dcterms.xsd">
"""
    yield feed_header

    # add title
    # TO DO: will need to calculate the number items and where in the feed we are
    yield text_node('title', title + ' - sorted by ' + order_by).prettify()

    # id
    feed = text_node(
        'id',
        "{url}/api/opds/{feed_path}/?order_by={order_by}".format(
            url=UNGLUEIT_URL,
            feed_path=urlquote(feed_path),
            order_by=order_by,
        ),
    )
    yield feed.prettify()

    # updated
    # TO DO: fix time zone?
    # also use our wrapped datetime code
    feed = text_node('updated', pytz.utc.localize(datetime.datetime.utcnow()).isoformat())
    yield feed.prettify()

    # author
    author_node = soup.new_tag("author")
    author_node.append(text_node('name', 'unglue.it'))
    author_node.append(text_node('uri', UNGLUEIT_URL))
    yield author_node.prettify()

    # links: start, self, next/prev (depending what's necessary -- to start with put all CC books)
    # start link
    yield navlink('start', feed_path, None, order_by, title="First 10").prettify()

    # next link
    if not page:
        page = 0
    else:
        try:
            page = int(page)
        except (TypeError, ValueError):
            # fix: int('garbage') raises ValueError, not TypeError; both now
            # fall back to the first page instead of propagating
            page = 0

    try:
        # probe one element past this page; IndexError means no next page
        works[10 * page + 10]
        yield navlink('next', feed_path, page+1, order_by, title="Next 10").prettify()
    except IndexError:
        pass

    # sort facets
    yield navlink(FACET_RELATION, feed_path, None, 'popular', group="Order",
                  active=order_by == 'popular', title="Sorted by popularity").prettify()
    yield navlink(FACET_RELATION, feed_path, None, 'newest', group="Order",
                  active=order_by == 'newest', title="Sorted by newest").prettify()

    # other facets (new-style facets only)
    if feed_path not in old_facets:
        for other_group in the_facet.facet_object.get_other_groups():
            for facet_object in other_group.get_facets():
                yield navlink(FACET_RELATION, feed_path + '/' + facet_object.facet_name,
                              None, order_by, group=other_group.title,
                              title=facet_object.title).prettify()

    # narrow to this page's 10 works
    works = islice(works, 10 * page, 10 * page + 10)
    if page > 0:
        yield navlink('previous', feed_path, page-1, order_by, title="Previous 10").prettify()

    for work in works:
        yield work_node(work, facet=the_facet.facet_object).prettify()

    yield '</feed>'
2020-03-27 17:48:18 +00:00
2020-03-27 18:23:43 +00:00
def navlink(rel, path, page, order_by, group=None, active=None, title=""):
    """Build an OPDS navigation <link> element.

    rel: link relation (e.g. 'start', 'next', or FACET_RELATION).
    page: optional page number appended to the querystring when not None.
    group/active: facet-group metadata, emitted only for facet links.
    """
    href = UNGLUEIT_URL + "/api/opds/" + urlquote(path) + '/?order_by=' + order_by
    if page is not None:
        href = href + '&page=' + str(page)
    link = soup.new_tag("link")
    link.attrs.update({
        "rel": rel,
        "href": href,
        "type": ACQUISITION,
        "title": title,
    })
    if rel == FACET_RELATION:
        if group:
            link['opds:facetGroup'] = group
        if active:
            link['opds:activeFacet'] = 'true'
    return link