# regluit/notebooks/opds.py


# coding: utf-8

# 
# 
# Let me see some examples of OPDS in the wild to see how it works:
# 
# available feeds: https://code.google.com/p/openpub/wiki/AvailableFeeds
# 
# let's look at archive.org, which presumably should have a good feed
# 
# * archive.org: http://bookserver.archive.org/catalog/
# * feedbooks.com: http://www.feedbooks.com/catalog.atom
# * oreilly.com: http://opds.oreilly.com/opds/
# 

## Some concepts

# http://www.slideshare.net/fullscreen/HadrienGardeur/understanding-opds/7
# 
# OPDS is based on
# 
# * resources
# * collections 
# 
# A collection aggregates resources.
# 
# Two kinds of resources:
# 
# * Navigation link 
# * Catalog entry 
# 
# for two kinds of collections:
# 
# * Navigation 
# * Acquisition

### Acquisition scenarios

# Multiple acquisition scenarios:
#     
# * Open Access
# * Sale
# * Lending
# * Subscription
# * Extract
# * Undefined

# In[ ]:

import requests
from lxml.etree import fromstring

# XML namespace URI of the Atom syndication format.
ATOM_NS = "http://www.w3.org/2005/Atom"

def nsq(url, tag):
    """Return *tag* qualified by namespace *url* as ``{url}tag``.

    This is the namespace-qualified tag spelling that the ElementTree-style
    ``find``/``findall`` calls below are given.
    """
    return "".join(("{", url, "}", tag))

# Root catalog of the Internet Archive's OPDS bookserver.
url = "http://bookserver.archive.org/catalog/"

r = requests.get(url)


# In[ ]:

# Parse the raw response bytes rather than r.text: lxml raises ValueError for
# unicode input that carries an XML encoding declaration, whereas with bytes
# it reads the declared encoding itself.
doc = fromstring(r.content)
doc


# In[ ]:

# get links
# what types specified in spec?

# Attributes of every Atom <link> that is a direct child of the parsed root.
[link.attrib for link in doc.iterfind(nsq(ATOM_NS, 'link'))]


# it might be useful to use specialized libraries to handle Atom or AtomPub.

# In[ ]:

# Every Atom <entry> that is a direct child of the parsed root.
list(doc.iterfind(nsq(ATOM_NS, "entry")))


## Atom feed generation

# https://github.com/sramana/pyatom
# 
#     pip install pyatom

# In[ ]:

# let's try the basics of pyatom
# puzzled where <links> come from.

from pyatom import AtomFeed
import datetime

# Feed-level metadata handed to pyatom.
feed = AtomFeed(
    title="Unglue.it",
    subtitle="Unglue.it OPDS Navigation",
    feed_url="https://unglue.it/opds",
    url="https://unglue.it/",
    author="unglue.it",
)

# Do this for each feed entry
feed.add(
    title="My Post",
    content="Body of my post",
    content_type="html",
    author="Me",
    url="http://example.org/entry1",
    updated=datetime.datetime.utcnow(),
)

# Parenthesising the single argument prints exactly the same text under the
# Python 2 print statement.
print(feed.to_string())


## Creating navigation feed

# template: https://gist.github.com/rdhyee/94d82f6639809fb7796f#file-unglueit_nav_opds-xml

# 
# ````xml
# <feed xmlns:dcterms="http://purl.org/dc/terms/" xmlns:opds="http://opds-spec.org/"
#   xmlns="http://www.w3.org/2005/Atom"
#   xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
#   xsi:noNamespaceSchemaLocation="http://www.kbcafe.com/rss/atom.xsd.xml">
#   <title>Unglue.it Catalog</title>
#   <id>https://unglue.it/opds</id>
#   <updated>2014-06-13T21:48:34Z</updated>
#   <author>
#     <name>unglue.it</name>
#     <uri>https://unglue.it</uri>
#   </author>
#   <!-- crawlable link in archive.org (optional for unglue.it) -->
#   <link rel="http://opds-spec.org/crawlable" type="application/atom+xml;profile=opds-catalog;kind=acquisition" href="https://unglue.it/opds/crawlable" title="Crawlable feed"/>
#   <link rel="start" href="https://unglue.it/opds" type="application/atom+xml;profile=opds-catalog;kind=navigation" />
#   <entry>
#     <title>Creative Commons</title>
#     <id>https://unglue.it/creativecommons/</id>
#     <updated>2014-06-13T00:00:00Z</updated>
#     <link href="creativecommons.xml" type="application/atom+xml;profile=opds-catalog;kind=acquisition" />
#     <content>These Creative Commons licensed ebooks are ready to read - the people who created them want you to read and share them..</content>
#   </entry>
#   <entry>
#     <title>Active Campaigns</title>
#     <id>https://unglue.it/campaigns/ending#2</id>
#     <updated>2014-06-13T00:00:00Z</updated>
#     <link href="active_campaigns.xml" type="application/atom+xml;profile=opds-catalog;kind=acquisition"/>
#     <content>With your help we're raising money to make these books free to the world.</content>
#   </entry>
# </feed>````

# In[ ]:

from lxml import etree
import datetime
import pytz

def text_node(tag, text):
    """Create a childless element named *tag* whose text content is *text*.

    The element is returned detached; the caller appends it where needed.
    """
    element = etree.Element(tag)
    element.text = text
    return element

def entry_node(title, id_, updated, link_href, link_type, content):
    """Build an <entry> element for the navigation feed.

    The children are appended in this order: title, id, updated, a link
    carrying ``href``/``type`` attributes, and content.

    title, id_, updated, content -- text of the respective child elements
    link_href, link_type         -- values of the link's href and type attributes

    Returns the new, detached "entry" element.
    """
    entry = etree.Element("entry")
    for child_tag, child_text in (("title", title),
                                  ("id", id_),
                                  ("updated", updated)):
        entry.append(text_node(child_tag, child_text))

    link = etree.Element("link")
    link.attrib.update({'href':link_href, 'type':link_type})
    entry.append(link)

    entry.append(text_node("content", content))
    return entry

# Empty root <feed/> element: it only carries the namespace declarations and
# schema hints; every child is appended programmatically below.
feed_xml = """<feed xmlns:dcterms="http://purl.org/dc/terms/" 
  xmlns:opds="http://opds-spec.org/"
  xmlns="http://www.w3.org/2005/Atom"
  xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:noNamespaceSchemaLocation="http://www.kbcafe.com/rss/atom.xsd.xml"
  xsi:schemaLocation="http://purl.org/dc/elements/1.1/ http://dublincore.org/schemas/xmls/qdc/2008/02/11/dc.xsd http://purl.org/dc/terms/ http://dublincore.org/schemas/xmls/qdc/2008/02/11/dcterms.xsd"/>"""

feed = etree.fromstring(feed_xml)

# title, id and updated (current time, localized to UTC), in that order
feed.extend([
    text_node('title', "Unglue.it Catalog"),
    text_node('id', "https://unglue.it/opds"),
    text_node('updated',
              pytz.utc.localize(datetime.datetime.utcnow()).isoformat()),
])

# author
author_node = etree.Element("author")
author_node.extend([
    text_node('name', 'unglue.it'),
    text_node('uri', 'https://unglue.it'),
])
feed.append(author_node)

# start link (the navigation root) followed by the crawlable link
start_link = etree.Element("link")
start_link.attrib.update({"rel":"start",
 "href":"https://unglue.it/opds",
 "type":"application/atom+xml;profile=opds-catalog;kind=navigation",
})

crawlable_link = etree.Element("link")
crawlable_link.attrib.update({"rel":"http://opds-spec.org/crawlable", 
 "href":"https://unglue.it/opds/crawlable",
 "type":"application/atom+xml;profile=opds-catalog;kind=acquisition",
  "title":"Crawlable feed"})

feed.extend([start_link, crawlable_link])

# CC entry_node: the only navigation entry generated so far
cc_entry = entry_node(
    title="Creative Commons",
    id_="https://unglue.it/creativecommons/",
    updated="2014-06-13T00:00:00Z",
    link_href="creativecommons.xml",
    link_type="application/atom+xml;profile=opds-catalog;kind=acquisition",
    content="These Creative Commons licensed ebooks are ready to read - the people who created them want you to read and share them..",
)
feed.append(cc_entry)

# Parenthesising the single argument prints exactly the same text under the
# Python 2 print statement.
print(etree.tostring(feed, pretty_print=True))


## Writing Crawlable Feed

# ````xml
# <feed xmlns:dcterms="http://purl.org/dc/terms/" xmlns:opds="http://opds-spec.org/"
#   xmlns="http://www.w3.org/2005/Atom"
#   xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
#   xsi:noNamespaceSchemaLocation="http://www.kbcafe.com/rss/atom.xsd.xml"
#   xsi:schemaLocation="http://purl.org/dc/elements/1.1/ http://dublincore.org/schemas/xmls/qdc/2008/02/11/dc.xsd http://purl.org/dc/terms/ http://dublincore.org/schemas/xmls/qdc/2008/02/11/dcterms.xsd">  
#   <title>Unglue.it Catalog -- 1 to 1 of 2000 -- crawlable feed</title>
#   <id>https://unglue.it/opds/crawlable</id>
#   <updated>2014-06-16T00:00:00Z</updated>
#   <link rel="start" href="https://unglue.it/opds" type="application/atom+xml;profile=opds-catalog;kind=navigation" />
#   <link rel="self" type="application/atom+xml;profile=opds-catalog;kind=acquisition" href="https://unglue.it/opds/crawlable"/>
#   <author>
#     <name>unglue.it</name>
#     <uri>https://unglue.it</uri>
#   </author>
#   <link rel="next" type="application/atom+xml;profile=opds-catalog;kind=acquisition" href="/opds/crawlable/1" title="Next results"/>
#   <entry>
#     <title>Oral Literature In Africa</title>
#     <id>https://unglue.it/work/81834/</id>
#     <updated>2013-07-17T23:27:37Z</updated>
#     <link href="https://unglue.it/download_ebook/904/" type="application/pdf" rel="http://opds-spec.org/acquisition"/>
#     <link href="https://unglue.it/download_ebook/905/" type="application/epub+zip" rel="http://opds-spec.org/acquisition"/>
#     <link href="https://unglue.it/download_ebook/906/" type="application/x-mobipocket-ebook" rel="http://opds-spec.org/acquisition"/>
#     <link href="https://unglueit.files.wordpress.com/2012/05/olacover_thumbnail.jpg" type="image/jpeg" rel="http://opds-spec.org/image/thumbnail"/>
#     <dcterms:issued>2012</dcterms:issued>
#     <author>
#       <name>Ruth Finnegan</name>
#     </author>
#     <category term="Africa"/>
#     <category term="African Folk literature"/>
#     <category term="Folk literature"/>
#     <dcterms:publisher>Open Book Publishers</dcterms:publisher>
#     <dcterms:language>en</dcterms:language>
#     <content type="html"></content>
#   </entry>
# </feed>
# ````

# In[ ]:

# crawlable feed

from itertools import islice

from lxml import etree
import datetime
import urlparse

import pytz

from regluit.core import models
import regluit.core.cc as cc

licenses = cc.LICENSE_LIST

# MIME type put on an acquisition link, looked up by the ebook's format name.
# NOTE(review): 'text' maps to text/html, the same as 'html' -- confirm this
# is intended rather than text/plain.
FORMAT_TO_MIMETYPE = {
    'pdf': "application/pdf",
    'epub': "application/epub+zip",
    'mobi': "application/x-mobipocket-ebook",
    'html': "text/html",
    'text': "text/html",
}

def text_node(tag, text):
    """Create a childless element named *tag* whose text content is *text*.

    The element is returned detached; the caller appends it where needed.
    """
    element = etree.Element(tag)
    element.text = text
    return element

def map_to_unglueit(url):
    """Return *url* with its scheme and host forced to https://unglue.it.

    Path, params, query string and fragment are carried over unchanged.
    """
    parts = urlparse.urlparse(url)
    # Swap the first two components (scheme, netloc) and keep the other four.
    return urlparse.urlunparse(('https', 'unglue.it') + tuple(parts[2:]))

def work_node(work):
    """Build the acquisition-feed <entry> element describing one work.

    work -- presumably a regluit.core.models.Work (TODO confirm). This
            function reads title, get_absolute_url(), created, ebooks(),
            cover_image_small(), publication_date, author(), publishers(),
            language and subjects from it.

    Returns the new, detached "entry" element; the caller appends it to a feed.
    """
    node = etree.Element("entry")

    # title
    node.append(text_node("title", work.title))

    # id -- the work's absolute URL on unglue.it
    node.append(text_node('id', "https://unglue.it{0}".format(work.get_absolute_url())))

    # updated -- using creation date
    node.append(text_node('updated', work.created.isoformat()))

    # one acquisition link per ebook; a format that is missing from
    # FORMAT_TO_MIMETYPE gets an empty type attribute
    for ebook in work.ebooks():
        link_node = etree.Element("link")
        link_node.attrib.update({"href":map_to_unglueit(ebook.download_url),
                                 "type":FORMAT_TO_MIMETYPE.get(ebook.format, ""),
                                 "rel":"http://opds-spec.org/acquisition"})
        node.append(link_node)

    # cover thumbnail -- the image type is assumed to be JPEG
    cover_node = etree.Element("link")
    cover_node.attrib.update({"href":work.cover_image_small(),
                              "type":"image/jpeg",
                              "rel":"http://opds-spec.org/image/thumbnail"})
    node.append(cover_node)

    # <dcterms:issued>2012</dcterms:issued>
    node.append(text_node("{http://purl.org/dc/terms/}issued", work.publication_date))

    # author
    # TODO: include all authors?
    author_node = etree.Element("author")
    author_node.append(text_node("name", work.author()))
    node.append(author_node)

    # publisher
    # <dcterms:publisher>Open Book Publishers</dcterms:publisher>
    # Emitted as dcterms:publisher; the previous code reused the
    # dcterms:issued tag here, so publishers showed up as extra issue dates.
    # Iterating directly also avoids calling publishers() twice.
    for publisher in work.publishers():
        node.append(text_node("{http://purl.org/dc/terms/}publisher", publisher.name.name))

    # language
    # <dcterms:language>en</dcterms:language>
    node.append(text_node("{http://purl.org/dc/terms/}language", work.language))

    # subject tags -- one <category term="..."/> per subject; an empty result
    # simply adds nothing, so no separate emptiness check is needed
    for subject in work.subjects.all():
        category_node = etree.Element("category")
        category_node.attrib["term"] = subject.name
        node.append(category_node)

    return node

# Empty root <feed/> element: it only carries the namespace declarations and
# schema hints; every child is appended programmatically below.
feed_xml = """<feed xmlns:dcterms="http://purl.org/dc/terms/" 
  xmlns:opds="http://opds-spec.org/"
  xmlns="http://www.w3.org/2005/Atom"
  xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:noNamespaceSchemaLocation="http://www.kbcafe.com/rss/atom.xsd.xml"
  xsi:schemaLocation="http://purl.org/dc/elements/1.1/ http://dublincore.org/schemas/xmls/qdc/2008/02/11/dc.xsd http://purl.org/dc/terms/ http://dublincore.org/schemas/xmls/qdc/2008/02/11/dcterms.xsd"/>"""

feed = etree.fromstring(feed_xml)

# title, id and updated (current time, localized to UTC), in that order
# TODO: the title will need the number of items and the position in the feed
# TODO: fix time zone?
feed.extend([
    text_node('title', "Unglue.it Catalog: crawlable feed"),
    text_node('id', "https://unglue.it/opds/crawlable"),
    text_node('updated',
              pytz.utc.localize(datetime.datetime.utcnow()).isoformat()),
])

# author
author_node = etree.Element("author")
author_node.extend([
    text_node('name', 'unglue.it'),
    text_node('uri', 'https://unglue.it'),
])
feed.append(author_node)

# links:  start, self, next/prev (depending what's necessary -- to start with put all CC books)
start_link = etree.Element("link")
start_link.attrib.update({"rel":"start",
 "href":"https://unglue.it/opds",
 "type":"application/atom+xml;profile=opds-catalog;kind=navigation",
})

self_link = etree.Element("link")
self_link.attrib.update({"rel":"self",
 "href":"https://unglue.it/opds/crawlable",
 "type":"application/atom+xml;profile=opds-catalog;kind=acquisition",
})

feed.extend([start_link, self_link])

licenses = cc.LICENSE_LIST

# Works with at least one ebook whose rights value is in the license list,
# de-duplicated and ordered by descending creation date.
ccworks = (models.Work.objects
           .filter(editions__ebooks__isnull=False,
                   editions__ebooks__rights__in=licenses)
           .distinct()
           .order_by('-created'))

# islice(..., None) places no limit: every matching work gets an entry.
for work in islice(ccworks, None):
    node = work_node(work)
    feed.append(node)

# Parenthesising the single argument prints exactly the same text under the
# Python 2 print statement.
print(etree.tostring(feed, pretty_print=True))


# In[ ]:

# how to get CC books?
# make use of CCListView: https://github.com/Gluejar/regluit/blob/b675052736f79dcb8d84ddc6349c99fa392fa9bc/frontend/views.py#L878
# template: https://github.com/Gluejar/regluit/blob/b675052736f79dcb8d84ddc6349c99fa392fa9bc/frontend/templates/cc_list.html

from regluit.core import models
import regluit.core.cc as cc

licenses = cc.LICENSE_LIST

# Works with at least one ebook whose rights value is in the license list,
# de-duplicated and ordered by descending creation date.
ccworks = (models.Work.objects
           .filter(editions__ebooks__isnull=False,
                   editions__ebooks__rights__in=licenses)
           .distinct()
           .order_by('-created'))
ccworks


# In[ ]:

# what attributes does a work offer?
dir(ccworks[0])


# In[ ]:

# ...and what does one of its ebooks offer?
work = ccworks[0]
ebook = work.ebooks()[0]
dir(ebook)


# In[ ]:

from collections import Counter

# Tally how often each ebook format occurs across all matching works.
c = Counter()

for work in islice(ccworks, None):
    for ebook in work.ebooks():
        c[ebook.format] += 1

print(c)

#[[ebook.format for ebook in work.ebooks()] for work in islice(ccworks,1)]

#[[ebook.format for ebook in work.ebooks()] for work in islice(ccworks,1)]


## Appendix:  dealing with namespaces in ElementTree

# Maybe come back to http://effbot.org/zone/element-namespaces.htm for more sophisticated ways to register namespaces.
Now able to create static versions of the navigation and acquisition feeds for all CC books. 2014-06-17 21:48:00 +00:00
			`# coding: utf-8`

			`#`
			`#`
			`# Let me see some examples of OPDS in the wild to see how it works:`
			`#`
			`# available feeds: https://code.google.com/p/openpub/wiki/AvailableFeeds`
			`#`
			`# let's look at archive.org, which presumably should have a good feed`
			`#`
			`# * archive.org: http://bookserver.archive.org/catalog/`
			`# * feedbooks.com: http://www.feedbooks.com/catalog.atom`
			`# * oreilly.com: http://opds.oreilly.com/opds/`
			`#`

			`## Some concepts`

			`# http://www.slideshare.net/fullscreen/HadrienGardeur/understanding-opds/7`
			`#`
			`# OPDS is based on`
			`#`
			`# * resources`
			`# * collections`
			`#`
			`# A collection aggregates resources.`
			`#`
			`# Two kinds of resources:`
			`#`
			`# * Navigation link`
			`# * Catalog entry`
			`#`
			`# for two kinds of collections:`
			`#`
			`# * Navigation`
			`# * Acquisition`

			`### Acquisition scenarios`

			`# Multiple acquisition scenarios:`
			`#`
			`# * Open Access`
			`# * Sale`
			`# * Lending`
			`# * Subscription`
			`# * Extract`
			`# * Undefined`

			`# In[ ]:`

			`import requests`
			`from lxml.etree import fromstring`

			`ATOM_NS = "http://www.w3.org/2005/Atom"`

			`def nsq(url, tag):`
			`return "{" + url +"}" + tag`

			`url = "http://bookserver.archive.org/catalog/"`

			`r = requests.get(url)`


			`# In[ ]:`

			`doc=fromstring(r.text)`
			`doc`


			`# In[ ]:`

			`# get links`
			`# what types specified in spec?`

			`[link.attrib for link in doc.findall(nsq(ATOM_NS,'link'))]`


			`# it might be useful to use specialized libraries to handle Atom or AtomPub.`

			`# In[ ]:`

			`doc.findall(nsq(ATOM_NS, "entry"))`


			`## Atom feed generation`

			`# https://github.com/sramana/pyatom`
			`#`
			`# pip install pyatom`

			`# In[ ]:`

			`# let's try the basics of pyatom`
			`# puzzled wwhere <links> come from.`

			`from pyatom import AtomFeed`
			`import datetime`

			`feed = AtomFeed(title="Unglue.it",`
			`subtitle="Unglue.it OPDS Navigation",`
			`feed_url="https://unglue.it/opds",`
			`url="https://unglue.it/",`
			`author="unglue.it")`

			`# Do this for each feed entry`
			`feed.add(title="My Post",`
			`content="Body of my post",`
			`content_type="html",`
			`author="Me",`
			`url="http://example.org/entry1",`
			`updated=datetime.datetime.utcnow())`

			`print feed.to_string()`


			`## Creating navigation feed`

			`# template: https://gist.github.com/rdhyee/94d82f6639809fb7796f#file-unglueit_nav_opds-xml`

			`#`
			# ````xml
			`# <feed xmlns:dcterms="http://purl.org/dc/terms/" xmlns:opds="http://opds-spec.org/"`
			`# xmlns="http://www.w3.org/2005/Atom"`
			`# xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"`
			`# xsi:noNamespaceSchemaLocation="http://www.kbcafe.com/rss/atom.xsd.xml">`
			`# <title>Unglue.it Catalog</title>`
			`# <id>https://unglue.it/opds</id>`
			`# <updated>2014-06-13T21:48:34Z</updated>`
			`# <author>`
			`# <name>unglue.it</name>`
			`# <uri>https://unglue.it</uri>`
			`# </author>`
			`# <!-- crawlable link in archive.org (optional for unglue.it) -->`
			`# <link rel="http://opds-spec.org/crawlable" type="application/atom+xml;profile=opds-catalog;kind=acquisition" href="https://unglue.it/opds/crawlable" title="Crawlable feed"/>`
			`# <link rel="start" href="https://unglue.it/opds" type="application/atom+xml;profile=opds-catalog;kind=navigation" />`
			`# <entry>`
			`# <title>Creative Commons</title>`
			`# <id>https://unglue.it/creativecommons/</id>`
			`# <updated>2014-06-13T00:00:00Z</updated>`
			`# <link href="creativecommons.xml" type="application/atom+xml;profile=opds-catalog;kind=acquisition" />`
			`# <content>These Creative Commons licensed ebooks are ready to read - the people who created them want you to read and share them..</content>`
			`# </entry>`
			`# <entry>`
			`# <title>Active Campaigns</title>`
			`# <id>https://unglue.it/campaigns/ending#2</id>`
			`# <updated>2014-06-13T00:00:00Z</updated>`
			`# <link href="active_campaigns.xml" type="application/atom+xml;profile=opds-catalog;kind=acquisition"/>`
			`# <content>With your help we're raising money to make these books free to the world.</content>`
			`# </entry>`
			# </feed>````

			`# In[ ]:`

			`from lxml import etree`
			`import datetime`
			`import pytz`

			`def text_node(tag, text):`
			`node = etree.Element(tag)`
			`node.text = text`
			`return node`

			`def entry_node(title, id_, updated, link_href, link_type, content):`
			`node = etree.Element("entry")`
			`node.append(text_node("title", title))`
			`node.append(text_node("id", id_))`
			`node.append(text_node("updated", updated))`

			`link_node = etree.Element("link")`
			`link_node.attrib.update({'href':link_href, 'type':link_type})`
			`node.append(link_node)`

			`node.append(text_node("content", content))`
			`return node`

			`feed_xml = """<feed xmlns:dcterms="http://purl.org/dc/terms/"`
			`xmlns:opds="http://opds-spec.org/"`
			`xmlns="http://www.w3.org/2005/Atom"`
			`xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"`
			`xsi:noNamespaceSchemaLocation="http://www.kbcafe.com/rss/atom.xsd.xml"`
			`xsi:schemaLocation="http://purl.org/dc/elements/1.1/ http://dublincore.org/schemas/xmls/qdc/2008/02/11/dc.xsd http://purl.org/dc/terms/ http://dublincore.org/schemas/xmls/qdc/2008/02/11/dcterms.xsd"/>"""`

			`feed = etree.fromstring(feed_xml)`

			`# add title`

			`feed.append(text_node('title', "Unglue.it Catalog"))`

			`# id`

			`feed.append(text_node('id', "https://unglue.it/opds"))`

			`# updated`

			`feed.append(text_node('updated',`
			`pytz.utc.localize(datetime.datetime.utcnow()).isoformat()))`

			`# author`

			`author_node = etree.Element("author")`
			`author_node.append(text_node('name', 'unglue.it'))`
			`author_node.append(text_node('uri', 'https://unglue.it'))`
			`feed.append(author_node)`

			`# start link`

			`start_link = etree.Element("link")`
			`start_link.attrib.update({"rel":"start",`
			`"href":"https://unglue.it/opds",`
			`"type":"application/atom+xml;profile=opds-catalog;kind=navigation",`
			`})`
			`feed.append(start_link)`

			`# crawlable link`

			`crawlable_link = etree.Element("link")`
			`crawlable_link.attrib.update({"rel":"http://opds-spec.org/crawlable",`
			`"href":"https://unglue.it/opds/crawlable",`
			`"type":"application/atom+xml;profile=opds-catalog;kind=acquisition",`
			`"title":"Crawlable feed"})`
			`feed.append(crawlable_link)`

			`# CC entry_node`

			`cc_entry = entry_node(title="Creative Commons",`
			`id_="https://unglue.it/creativecommons/",`
			`updated="2014-06-13T00:00:00Z",`
			`link_href="creativecommons.xml",`
			`link_type="application/atom+xml;profile=opds-catalog;kind=acquisition",`
			`content="These Creative Commons licensed ebooks are ready to read - the people who created them want you to read and share them..")`
			`feed.append(cc_entry)`

			`print etree.tostring(feed, pretty_print=True)`


			`## Writing Crawlable Feed`

			# ````xml
			`# <feed xmlns:dcterms="http://purl.org/dc/terms/" xmlns:opds="http://opds-spec.org/"`
			`# xmlns="http://www.w3.org/2005/Atom"`
			`# xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"`
			`# xsi:noNamespaceSchemaLocation="http://www.kbcafe.com/rss/atom.xsd.xml"`
			`# xsi:schemaLocation="http://purl.org/dc/elements/1.1/ http://dublincore.org/schemas/xmls/qdc/2008/02/11/dc.xsd http://purl.org/dc/terms/ http://dublincore.org/schemas/xmls/qdc/2008/02/11/dcterms.xsd">`
			`# <title>Unglue.it Catalog -- 1 to 1 of 2000 -- crawlable feed</title>`
			`# <id>https://unglue.it/opds/crawlable</id>`
			`# <updated>2014-06-16T00:00:00Z</updated>`
			`# <link rel="start" href="https://unglue.it/opds" type="application/atom+xml;profile=opds-catalog;kind=navigation" />`
			`# <link rel="self" type="application/atom+xml;profile=opds-catalog;kind=acquisition" href="https://unglue.it/opds/crawlable"/>`
			`# <author>`
			`# <name>unglue.it</name>`
			`# <uri>https://unglue.it</uri>`
			`# </author>`
			`# <link rel="next" type="application/atom+xml;profile=opds-catalog;kind=acquisition" href="/opds/crawlable/1" title="Next results"/>`
			`# <entry>`
			`# <title>Oral Literature In Africa</title>`
			`# <id>https://unglue.it/work/81834/</id>`
			`# <updated>2013-07-17T23:27:37Z</updated>`
			`# <link href="https://unglue.it/download_ebook/904/" type="application/pdf" rel="http://opds-spec.org/acquisition"/>`
			`# <link href="https://unglue.it/download_ebook/905/" type="application/epub+zip" rel="http://opds-spec.org/acquisition"/>`
			`# <link href="https://unglue.it/download_ebook/906/" type="application/x-mobipocket-ebook" rel="http://opds-spec.org/acquisition"/>`
			`# <link href="https://unglueit.files.wordpress.com/2012/05/olacover_thumbnail.jpg" type="image/jpeg" rel="http://opds-spec.org/image/thumbnail"/>`
			`# <dcterms:issued>2012</dcterms:issued>`
			`# <author>`
			`# <name>Ruth Finnegan</name>`
			`# </author>`
			`# <category term="Africa"/>`
			`# <category term="African Folk literature"/>`
			`# <category term="Folk literature"/>`
			`# <dcterms:publisher>Open Book Publishers</dcterms:publisher>`
			`# <dcterms:language>en</dcterms:language>`
			`# <content type="html"></content>`
			`# </entry>`
			`# </feed>`
			# ````

			`# In[ ]:`

			`# crawlable feed`

			`from itertools import islice`

			`from lxml import etree`
			`import datetime`
			`import urlparse`

			`import pytz`

			`from regluit.core import models`
			`import regluit.core.cc as cc`

			`licenses = cc.LICENSE_LIST`

			`FORMAT_TO_MIMETYPE = {'pdf':"application/pdf",`
			`'epub':"application/epub+zip",`
			`'mobi':"application/x-mobipocket-ebook",`
			`'html':"text/html",`
			`'text':"text/html"}`

			`def text_node(tag, text):`
			`node = etree.Element(tag)`
			`node.text = text`
			`return node`

			`def map_to_unglueit(url):`
			`m = list(urlparse.urlparse(url))`
			`(m[0], m[1]) = ('https','unglue.it')`
			`return urlparse.urlunparse(m)`

			`def work_node(work):`
			`node = etree.Element("entry")`
			`# title`
			`node.append(text_node("title", work.title))`

			`# id`
			`node.append(text_node('id', "https://unglue.it{0}".format(work.get_absolute_url())))`

			`# updated -- using creation date`
			`node.append(text_node('updated', work.created.isoformat()))`

			`# links for all ebooks`

			`for ebook in work.ebooks():`
			`link_node = etree.Element("link")`
			`link_node.attrib.update({"href":map_to_unglueit(ebook.download_url),`
			`"type":FORMAT_TO_MIMETYPE.get(ebook.format, ""),`
			`"rel":"http://opds-spec.org/acquisition"})`
			`node.append(link_node)`

			`# get the cover -- assume jpg?`

			`cover_node = etree.Element("link")`
			`cover_node.attrib.update({"href":work.cover_image_small(),`
			`"type":"image/jpeg",`
			`"rel":"http://opds-spec.org/image/thumbnail"})`
			`node.append(cover_node)`

			`# <dcterms:issued>2012</dcterms:issued>`
display publication range needs a migration 2015-10-05 23:17:16 +00:00			`node.append(text_node("{http://purl.org/dc/terms/}issued", work.publication_date))`
Now able to create static versions of the navigation and acquisition feeds for all CC books. 2014-06-17 21:48:00 +00:00
			`# author`
			`# TO DO: include all authors?`
			`author_node = etree.Element("author")`
			`author_node.append(text_node("name", work.author()))`
			`node.append(author_node)`

			`# publisher`
			`#<dcterms:publisher>Open Book Publishers</dcterms:publisher>`
			`if len(work.publishers()):`
			`for publisher in work.publishers():`
			`node.append(text_node("{http://purl.org/dc/terms/}issued", publisher.name.name))`

			`# language`
			`#<dcterms:language>en</dcterms:language>`
			`node.append(text_node("{http://purl.org/dc/terms/}language", work.language))`

			`# subject tags`
			`# [[subject.name for subject in work.subjects.all()] for work in ccworks if work.subjects.all()]`
			`if work.subjects.all():`
			`for subject in work.subjects.all():`
			`category_node = etree.Element("category")`
			`category_node.attrib["term"] = subject.name`
			`node.append(category_node)`

			`return node`

			`feed_xml = """<feed xmlns:dcterms="http://purl.org/dc/terms/"`
			`xmlns:opds="http://opds-spec.org/"`
			`xmlns="http://www.w3.org/2005/Atom"`
			`xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"`
			`xsi:noNamespaceSchemaLocation="http://www.kbcafe.com/rss/atom.xsd.xml"`
			`xsi:schemaLocation="http://purl.org/dc/elements/1.1/ http://dublincore.org/schemas/xmls/qdc/2008/02/11/dc.xsd http://purl.org/dc/terms/ http://dublincore.org/schemas/xmls/qdc/2008/02/11/dcterms.xsd"/>"""`

			`feed = etree.fromstring(feed_xml)`

			`# add title`
			`# TO DO: will need to calculate the number items and where in the feed we are`

			`feed.append(text_node('title', "Unglue.it Catalog: crawlable feed"))`

			`# id`

			`feed.append(text_node('id', "https://unglue.it/opds/crawlable"))`

			`# updated`
			`# TO DO: fix time zone?`

			`feed.append(text_node('updated',`
			`pytz.utc.localize(datetime.datetime.utcnow()).isoformat()))`

			`# author`

			`author_node = etree.Element("author")`
			`author_node.append(text_node('name', 'unglue.it'))`
			`author_node.append(text_node('uri', 'https://unglue.it'))`
			`feed.append(author_node)`

			`# links: start, self, next/prev (depending what's necessary -- to start with put all CC books)`

			`# start link`

			`start_link = etree.Element("link")`
			`start_link.attrib.update({"rel":"start",`
			`"href":"https://unglue.it/opds",`
			`"type":"application/atom+xml;profile=opds-catalog;kind=navigation",`
			`})`
			`feed.append(start_link)`

			`# self link`

			`self_link = etree.Element("link")`
			`self_link.attrib.update({"rel":"self",`
			`"href":"https://unglue.it/opds/crawlable",`
			`"type":"application/atom+xml;profile=opds-catalog;kind=acquisition",`
			`})`
			`feed.append(self_link)`

			`licenses = cc.LICENSE_LIST`

			`ccworks = models.Work.objects.filter(editions__ebooks__isnull=False,`
			`editions__ebooks__rights__in=licenses).distinct().order_by('-created')`

			`for work in islice(ccworks,None):`
			`node = work_node(work)`
			`feed.append(node)`

			`print etree.tostring(feed, pretty_print=True)`


			`# In[ ]:`

			`# how to get CC books?`
			`# make use of CCListView: https://github.com/Gluejar/regluit/blob/b675052736f79dcb8d84ddc6349c99fa392fa9bc/frontend/views.py#L878`
			`# template: https://github.com/Gluejar/regluit/blob/b675052736f79dcb8d84ddc6349c99fa392fa9bc/frontend/templates/cc_list.html`

			`from regluit.core import models`
			`import regluit.core.cc as cc`

			`licenses = cc.LICENSE_LIST`

			`ccworks = models.Work.objects.filter(editions__ebooks__isnull=False,`
			`editions__ebooks__rights__in=licenses).distinct().order_by('-created')`
			`ccworks`


			`# In[ ]:`

			`dir(ccworks[0])`


			`# In[ ]:`

			`work = ccworks[0]`
			`ebook = work.ebooks()[0]`
			`dir(ebook)`


			`# In[ ]:`

			`from collections import Counter`

			`c = Counter()`

			`for work in islice(ccworks,None):`
			`c.update([ebook.format for ebook in work.ebooks()])`

			`print c`

			`#[[ebook.format for ebook in work.ebooks()] for work in islice(ccworks,1)]`


			`## Appendix: dealing with namespaces in ElementTree`

			`# Maybe come back to http://effbot.org/zone/element-namespaces.htm for more sophisticated ways to register namespaces.`