# coding: utf-8

# Let me see some examples of OPDS in the wild to see how it works.
# 
# available feeds: https://code.google.com/p/openpub/wiki/AvailableFeeds
# 
# Let's look at archive.org, which presumably should have a good feed:
# 
# * archive.org: http://bookserver.archive.org/catalog/
# * feedbooks.com: http://www.feedbooks.com/catalog.atom
# * oreilly.com: http://opds.oreilly.com/opds/

# ## Some concepts

# http://www.slideshare.net/fullscreen/HadrienGardeur/understanding-opds/7
# 
# OPDS is based on
# 
# * resources
# * collections
# 
# A collection aggregates resources.
# 
# There are two kinds of resources:
# 
# * Navigation link
# * Catalog entry
# 
# for two kinds of collections:
# 
# * Navigation
# * Acquisition

# ### Acquisition scenarios

# There are multiple acquisition scenarios:
# 
# * Open Access
# * Sale
# * Lending
# * Subscription
# * Extract
# * Undefined

# In[ ]:

import requests
from lxml.etree import fromstring

ATOM_NS = "http://www.w3.org/2005/Atom"

def nsq(url, tag):
    return "{" + url + "}" + tag

url = "http://bookserver.archive.org/catalog/"
r = requests.get(url)

# In[ ]:

# parse the raw bytes: lxml rejects unicode strings that carry an encoding declaration
doc = fromstring(r.content)
doc

# In[ ]:

# get the links
# which link types does the spec define?
[link.attrib for link in doc.findall(nsq(ATOM_NS, 'link'))]

# It might be useful to use specialized libraries to handle Atom or AtomPub.

# In[ ]:

doc.findall(nsq(ATOM_NS, "entry"))
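# An OPDS link advertises the kind of feed it points to in its `type` attribute (`kind=navigation` vs. `kind=acquisition`). Here's a small sketch of my own (not from the spec) that groups the catalog's links by that kind, reusing `doc` and `nsq` from the cells above:

# In[ ]:

# group the catalog's links by the OPDS kind declared in their type attribute
from collections import defaultdict

links_by_kind = defaultdict(list)

for link in doc.findall(nsq(ATOM_NS, 'link')):
    link_type = link.attrib.get('type', '')
    if 'kind=navigation' in link_type:
        kind = 'navigation'
    elif 'kind=acquisition' in link_type:
        kind = 'acquisition'
    else:
        kind = 'other'
    links_by_kind[kind].append(link.attrib.get('href'))

dict(links_by_kind)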
# ## Atom feed generation

# https://github.com/sramana/pyatom
# 
#     pip install pyatom

# In[ ]:

# let's try the basics of pyatom
# (puzzled where some of the generated values come from)

from pyatom import AtomFeed
import datetime

feed = AtomFeed(title="Unglue.it",
                subtitle="Unglue.it OPDS Navigation",
                feed_url="https://unglue.it/opds",
                url="https://unglue.it/",
                author="unglue.it")

# Do this for each feed entry
feed.add(title="My Post",
         content="Body of my post",
         content_type="html",
         author="Me",
         url="http://example.org/entry1",
         updated=datetime.datetime.utcnow())

print feed.to_string()

# ## Creating a navigation feed

# template: https://gist.github.com/rdhyee/94d82f6639809fb7796f#file-unglueit_nav_opds-xml
# 
# ````xml
# <feed xmlns="http://www.w3.org/2005/Atom">
#   <title>Unglue.it Catalog</title>
#   <id>https://unglue.it/opds</id>
#   <updated>2014-06-13T21:48:34Z</updated>
#   <author>
#     <name>unglue.it</name>
#     <uri>https://unglue.it</uri>
#   </author>
#   <link rel="start" href="https://unglue.it/opds"
#         type="application/atom+xml;profile=opds-catalog;kind=navigation"/>
#   <link rel="http://opds-spec.org/crawlable" href="https://unglue.it/opds/crawlable"
#         type="application/atom+xml;profile=opds-catalog;kind=acquisition"
#         title="Crawlable feed"/>
#   <entry>
#     <title>Creative Commons</title>
#     <id>https://unglue.it/creativecommons/</id>
#     <updated>2014-06-13T00:00:00Z</updated>
#     <link href="creativecommons.xml"
#           type="application/atom+xml;profile=opds-catalog;kind=acquisition"/>
#     <content>These Creative Commons licensed ebooks are ready to read - the people who created them want you to read and share them..</content>
#   </entry>
#   <entry>
#     <title>Active Campaigns</title>
#     <id>https://unglue.it/campaigns/ending#2</id>
#     <updated>2014-06-13T00:00:00Z</updated>
#     <link href="..."
#           type="application/atom+xml;profile=opds-catalog;kind=acquisition"/>
#     <content>With your help we're raising money to make these books free to the world.</content>
#   </entry>
# </feed>
# ````

# In[ ]:

from lxml import etree
import datetime
import pytz

def text_node(tag, text):
    node = etree.Element(tag)
    node.text = text
    return node

def entry_node(title, id_, updated, link_href, link_type, content):
    node = etree.Element("entry")
    node.append(text_node("title", title))
    node.append(text_node("id", id_))
    node.append(text_node("updated", updated))
    
    link_node = etree.Element("link")
    link_node.attrib.update({'href': link_href,
                             'type': link_type})
    node.append(link_node)
    
    node.append(text_node("content", content))
    return node

# an empty feed root in the Atom namespace
feed_xml = """<feed xmlns="http://www.w3.org/2005/Atom"/>"""

feed = etree.fromstring(feed_xml)

# add title
feed.append(text_node('title', "Unglue.it Catalog"))

# id
feed.append(text_node('id', "https://unglue.it/opds"))

# updated
feed.append(text_node('updated', pytz.utc.localize(datetime.datetime.utcnow()).isoformat()))

# author
author_node = etree.Element("author")
author_node.append(text_node('name', 'unglue.it'))
author_node.append(text_node('uri', 'https://unglue.it'))
feed.append(author_node)

# start link
start_link = etree.Element("link")
start_link.attrib.update({"rel": "start",
                          "href": "https://unglue.it/opds",
                          "type": "application/atom+xml;profile=opds-catalog;kind=navigation",
                          })
feed.append(start_link)

# crawlable link
crawlable_link = etree.Element("link")
crawlable_link.attrib.update({"rel": "http://opds-spec.org/crawlable",
                              "href": "https://unglue.it/opds/crawlable",
                              "type": "application/atom+xml;profile=opds-catalog;kind=acquisition",
                              "title": "Crawlable feed"})
feed.append(crawlable_link)

# CC entry
cc_entry = entry_node(title="Creative Commons",
                      id_="https://unglue.it/creativecommons/",
                      updated="2014-06-13T00:00:00Z",
                      link_href="creativecommons.xml",
                      link_type="application/atom+xml;profile=opds-catalog;kind=acquisition",
                      content="These Creative Commons licensed ebooks are ready to read - the people who created them want you to read and share them..")
feed.append(cc_entry)

print etree.tostring(feed, pretty_print=True)
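# To publish the feed built above, it still has to be serialized to bytes with an XML declaration. A minimal sketch of that step (the output file name is my assumption, echoing the gist's file name; the media type is the one already used in the links):

# In[ ]:

# serialize the navigation feed with an XML declaration
nav_bytes = etree.tostring(feed, pretty_print=True,
                           xml_declaration=True, encoding="UTF-8")

# hypothetical output path
with open("unglueit_nav_opds.xml", "wb") as f:
    f.write(nav_bytes)

# if served over HTTP, the Content-Type would be
# application/atom+xml;profile=opds-catalog;kind=navigation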
# ## Writing Crawlable Feed

# ````xml
# <feed xmlns="http://www.w3.org/2005/Atom" xmlns:dcterms="http://purl.org/dc/terms/">
#   <title>Unglue.it Catalog -- 1 to 1 of 2000 -- crawlable feed</title>
#   <id>https://unglue.it/opds/crawlable</id>
#   <updated>2014-06-16T00:00:00Z</updated>
#   <link rel="start" href="https://unglue.it/opds"
#         type="application/atom+xml;profile=opds-catalog;kind=navigation"/>
#   <link rel="self" href="https://unglue.it/opds/crawlable"
#         type="application/atom+xml;profile=opds-catalog;kind=acquisition"/>
#   <author>
#     <name>unglue.it</name>
#     <uri>https://unglue.it</uri>
#   </author>
#   <entry>
#     <title>Oral Literature In Africa</title>
#     <id>https://unglue.it/work/81834/</id>
#     <updated>2013-07-17T23:27:37Z</updated>
#     <link href="..." rel="http://opds-spec.org/acquisition"
#           type="application/epub+zip"/>
#     <link href="..." rel="http://opds-spec.org/image/thumbnail"
#           type="image/jpeg"/>
#     <dcterms:issued>2012</dcterms:issued>
#     <author>
#       <name>Ruth Finnegan</name>
#     </author>
#     <dcterms:publisher>Open Book Publishers</dcterms:publisher>
#     <dcterms:language>en</dcterms:language>
#     <category term="..."/>
#   </entry>
# </feed>
# ````

# In[ ]:

# crawlable feed

from itertools import islice
from lxml import etree
import datetime
import urlparse
import pytz

from regluit.core import models
import regluit.core.cc as cc

licenses = cc.LICENSE_LIST

FORMAT_TO_MIMETYPE = {'pdf': "application/pdf",
                      'epub': "application/epub+zip",
                      'mobi': "application/x-mobipocket-ebook",
                      'html': "text/html",
                      'text': "text/html"}

def text_node(tag, text):
    node = etree.Element(tag)
    node.text = text
    return node

def map_to_unglueit(url):
    # rewrite a download URL so that it points at unglue.it over https
    m = list(urlparse.urlparse(url))
    (m[0], m[1]) = ('https', 'unglue.it')
    return urlparse.urlunparse(m)

def work_node(work):
    
    node = etree.Element("entry")
    
    # title
    node.append(text_node("title", work.title))
    
    # id
    node.append(text_node('id', "https://unglue.it{0}".format(work.get_absolute_url())))
    
    # updated -- using creation date
    node.append(text_node('updated', work.created.isoformat()))
    
    # links for all ebooks
    for ebook in work.ebooks():
        link_node = etree.Element("link")
        link_node.attrib.update({"href": map_to_unglueit(ebook.download_url),
                                 "type": FORMAT_TO_MIMETYPE.get(ebook.format, ""),
                                 "rel": "http://opds-spec.org/acquisition"})
        node.append(link_node)
    
    # get the cover -- assume jpg?
    cover_node = etree.Element("link")
    cover_node.attrib.update({"href": work.cover_image_small(),
                              "type": "image/jpeg",
                              "rel": "http://opds-spec.org/image/thumbnail"})
    node.append(cover_node)
    
    # <dcterms:issued>2012</dcterms:issued>
    node.append(text_node("{http://purl.org/dc/terms/}issued", work.publication_date_year))
    
    # author
    # TO DO: include all authors?
    author_node = etree.Element("author")
    author_node.append(text_node("name", work.author()))
    node.append(author_node)
    
    # publisher
    # <dcterms:publisher>Open Book Publishers</dcterms:publisher>
    if len(work.publishers()):
        for publisher in work.publishers():
            node.append(text_node("{http://purl.org/dc/terms/}publisher", publisher.name.name))
    
    # language
    # <dcterms:language>en</dcterms:language>
    node.append(text_node("{http://purl.org/dc/terms/}language", work.language))
    
    # subject tags
    # [[subject.name for subject in work.subjects.all()] for work in ccworks if work.subjects.all()]
    if work.subjects.all():
        for subject in work.subjects.all():
            category_node = etree.Element("category")
            category_node.attrib["term"] = subject.name
            node.append(category_node)
    
    return node

# an empty feed root in the Atom namespace, with dcterms declared for the metadata elements
feed_xml = """<feed xmlns="http://www.w3.org/2005/Atom" xmlns:dcterms="http://purl.org/dc/terms/"/>"""

feed = etree.fromstring(feed_xml)

# add title
# TO DO: will need to calculate the number of items and where in the feed we are
feed.append(text_node('title', "Unglue.it Catalog: crawlable feed"))

# id
feed.append(text_node('id', "https://unglue.it/opds/crawlable"))

# updated
# TO DO: fix time zone?
feed.append(text_node('updated', pytz.utc.localize(datetime.datetime.utcnow()).isoformat()))

# author
author_node = etree.Element("author")
author_node.append(text_node('name', 'unglue.it'))
author_node.append(text_node('uri', 'https://unglue.it'))
feed.append(author_node)

# links: start, self, next/prev (depending on what's necessary -- to start with, put in all the CC books)

# start link
start_link = etree.Element("link")
start_link.attrib.update({"rel": "start",
                          "href": "https://unglue.it/opds",
                          "type": "application/atom+xml;profile=opds-catalog;kind=navigation",
                          })
feed.append(start_link)

# self link
self_link = etree.Element("link")
self_link.attrib.update({"rel": "self",
                         "href": "https://unglue.it/opds/crawlable",
                         "type": "application/atom+xml;profile=opds-catalog;kind=acquisition",
                         })
feed.append(self_link)

licenses = cc.LICENSE_LIST

ccworks = models.Work.objects.filter(editions__ebooks__isnull=False,
                                     editions__ebooks__rights__in=licenses).distinct().order_by('-created')

for work in islice(ccworks, None):
    node = work_node(work)
    feed.append(node)

print etree.tostring(feed, pretty_print=True)

# In[ ]:

# how to get the CC books?
# make use of CCListView: https://github.com/Gluejar/regluit/blob/b675052736f79dcb8d84ddc6349c99fa392fa9bc/frontend/views.py#L878
# template: https://github.com/Gluejar/regluit/blob/b675052736f79dcb8d84ddc6349c99fa392fa9bc/frontend/templates/cc_list.html

from regluit.core import models
import regluit.core.cc as cc

licenses = cc.LICENSE_LIST

ccworks = models.Work.objects.filter(editions__ebooks__isnull=False,
                                     editions__ebooks__rights__in=licenses).distinct().order_by('-created')

ccworks

# In[ ]:

dir(ccworks[0])

# In[ ]:

work = ccworks[0]
ebook = work.ebooks()[0]
dir(ebook)

# In[ ]:

# count the ebook formats represented among the CC works
from collections import Counter

c = Counter()

for work in islice(ccworks, None):
    c.update([ebook.format for ebook in work.ebooks()])

print c

#[[ebook.format for ebook in work.ebooks()] for work in islice(ccworks,1)]

# ## Appendix: dealing with namespaces in ElementTree

# Maybe come back to http://effbot.org/zone/element-namespaces.htm for more sophisticated ways to register namespaces.
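# In the meantime, here's a small lxml-specific sketch of my own (`nsmap` is an lxml extension, not part of the stdlib ElementTree API): declaring the prefixes on the root element makes lxml serialize dcterms elements with a readable prefix instead of a generated one like `ns0:`.

# In[ ]:

# declare namespace prefixes up front via lxml's nsmap
from lxml import etree

ATOM = "http://www.w3.org/2005/Atom"
DCTERMS = "http://purl.org/dc/terms/"

root = etree.Element("{%s}feed" % ATOM,
                     nsmap={None: ATOM, "dcterms": DCTERMS})
issued = etree.SubElement(root, "{%s}issued" % DCTERMS)
issued.text = "2012"

# serializes as <dcterms:issued>2012</dcterms:issued>
print etree.tostring(root, pretty_print=True)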