# regluit/pyepub/__init__.py
import zipfile
import os
import re
import uuid
from StringIO import StringIO
import datetime
import logging
2013-09-16 01:43:01 +00:00
try:
import lxml.etree as ET
except ImportError:
import xml.etree.ElementTree as ET
logger = logging.getLogger(__name__)

# (prefix, URI) pairs for the XML vocabularies found in OPF and NCX documents.
_NAMESPACE_URIS = (
    ("dc", "http://purl.org/dc/elements/1.1/"),
    ("opf", "http://www.idpf.org/2007/opf"),
    ("ncx", "http://www.daisy.org/z3986/2005/ncx/"),
)

# prefix -> "{URI}", ready to be prepended to a tag name for ElementTree lookups.
NAMESPACE = dict(("%s" % _p, "{%s}" % _u) for _p, _u in _NAMESPACE_URIS)

# Make serialised documents use the conventional prefixes.
for _prefix, _uri in _NAMESPACE_URIS:
    ET.register_namespace(_prefix, _uri)
del _prefix, _uri
class InvalidEpub(Exception):
    """Raised when an archive cannot be processed as an EPUB."""
class EPUB(zipfile.ZipFile):
    """
    EPUB file representation class.

    A zipfile.ZipFile subclass whose constructor takes a mode of "r", "w"
    or "a". The package (OPF) and table-of-contents (NCX) documents are kept
    as element trees in ``self.opf`` and ``self.ncx``, and ``self.info`` holds
    a plain dict with "metadata", "manifest", "spine" and "guide" entries.
    """
2013-09-16 01:43:01 +00:00
def __init__(self, filename, mode="r"):
"""
Global Init Switch
:type filename: str or StringIO() or file like object for read or add
:param filename: File to be processed
:type mode: str
:param mode: "w" or "r", mode to init the zipfile
"""
self._write_files = {} # a dict of files written to the archive
self._delete_files = [] # a list of files to delete from the archive
self.epub_mode = mode
self.writename = None
2013-09-16 01:43:01 +00:00
if mode == "w":
if isinstance(filename, str):
self.writename = open(filename, "w") # on close, we'll overwrite on this file
else:
# filename is already a file like object
self.writename = filename
dummy= StringIO()
zipfile.ZipFile.__init__(self, dummy, mode="w") # fake
2013-09-16 01:43:01 +00:00
self.__init__write()
elif mode == "a":
# we're not going to write to the file until the very end
2013-09-16 01:43:01 +00:00
if isinstance(filename, str):
self.filename = open(filename, "w") # on close, we'll overwrite on this file
2013-09-16 01:43:01 +00:00
else:
# filename is already a file like object
self.filename = filename
self.filename.seek(0)
temp = StringIO()
temp.write(self.filename.read())
zipfile.ZipFile.__init__(self, self.filename, mode="r") # r mode doesn't set the filename
self.__init__read(temp)
2013-09-16 01:43:01 +00:00
else: # retrocompatibility?
zipfile.ZipFile.__init__(self, filename, mode="r")
self.__init__read(filename)
def __init__read(self, filename):
    """
    Populate the instance from an existing EPUB archive.

    Reads META-INF/container.xml to find the OPF package document, parses
    it, fills ``self.info`` (metadata, manifest, spine, guide), records the
    cover id and the unique identifier, then parses the table of contents
    into ``self.contents``.

    :type filename: str or StringIO()
    :param filename: File to be processed
    :raises InvalidEpub: when container.xml is missing, when it has no
                         first grandchild, or when the unique identifier
                         element cannot be found
    """
    self.filename = filename

    # By specification, there MUST be a container.xml in EPUB.
    try:
        container = self.read("META-INF/container.xml")
    except KeyError:
        logger.warning("The %s file is not a valid OCF." % str(filename))
        raise InvalidEpub

    # There MUST be a full path attribute on first grandchild,
    # else the file is invalid.
    try:
        rootfile = ET.fromstring(container)[0][0]
        self.opf_path = rootfile.get("full-path")
    except IndexError:
        logger.warning("The %s file is not a valid OCF." % str(filename))
        raise InvalidEpub

    # json-able info tree
    self.info = {"metadata": {}, "manifest": [], "spine": [], "guide": []}
    # Used to compose absolute paths for reading in zip archive
    self.root_folder = os.path.dirname(self.opf_path)
    self.opf = ET.fromstring(self.read(self.opf_path))  # OPF tree

    opf_ns = NAMESPACE["opf"]
    strip_ns = re.compile(r'\{.*?\}')  # RE to strip {namespace} mess

    # <metadata>: a tag seen once maps to its value; when the tag is seen
    # again the stored value becomes [previous value, new value].
    metadata = self.info["metadata"]
    for node in self.opf.find(opf_ns + "metadata"):
        if not (node.tag and isinstance(node.tag, str)):
            continue  # skip children whose tag is not a plain string
        tag = strip_ns.sub('', node.tag)
        value = node.text or node.attrib
        if tag in metadata:
            metadata[tag] = [metadata[tag], value]
        else:
            metadata[tag] = value

    # Manifest id of the cover, from <meta name="cover" />. It's an optional
    # field, so a missing element just means "no cover".
    cover_meta = self.opf.find('.//{0}meta[@name="cover"]'.format(opf_ns))
    self.cover = None if cover_meta is None else cover_meta.get("content")

    manifest_items = []
    for item in self.opf.find(opf_ns + "manifest"):
        if item.get("id"):
            manifest_items.append({"id": item.get("id"),
                                   "href": item.get("href"),
                                   "mimetype": item.get("media-type")})
    self.info["manifest"] = manifest_items

    spine_items = []
    for itemref in self.opf.find(opf_ns + "spine"):
        if itemref.get("idref"):
            spine_items.append({"idref": itemref.get("idref")})
    self.info["spine"] = spine_items

    # The guide element is optional: iterating a missing one raises
    # TypeError and the entry is set to None.
    guide_node = self.opf.find(opf_ns + "guide")
    try:
        guide_items = []
        for reference in guide_node:
            if reference.get("href"):
                guide_items.append({"href": reference.get("href"),
                                    "type": reference.get("type"),
                                    "title": reference.get("title")})
        self.info["guide"] = guide_items
    except TypeError:
        self.info["guide"] = None

    # Document identifier
    identifier_expr = './/{0}identifier[@id="{1}"]'.format(
        NAMESPACE["dc"], self.opf.get("unique-identifier"))
    try:
        self.id = self.opf.find(identifier_expr).text
    except AttributeError:
        raise InvalidEpub("Cannot process an EPUB without unique-identifier attribute of the package element")

    # Get and parse the TOC: either the item named by the "toc" attribute of
    # the third child of the package, or the item flagged as "nav".
    toc_id = self.opf[2].get("toc")
    expr = (".//{0}item[@id='{1:s}']".format(opf_ns, toc_id) if toc_id
            else ".//{0}item[@properties='nav']".format(opf_ns))
    toc_name = self.opf.find(expr).get("href")

    self.ncx_path = os.path.join(self.root_folder, toc_name)
    self.ncx = ET.fromstring(self.read(self.ncx_path))

    # iter() also visits nested navPoint elements.
    toc_entries = []
    for nav_point in self.ncx.iter(NAMESPACE["ncx"] + "navPoint"):
        toc_entries.append({"name": nav_point[0][0].text or "None",
                            "src": os.path.join(self.root_folder, nav_point[1].get("src")),
                            "id": nav_point.get("id")})
    self.contents = toc_entries
def __init__write(self):
"""
Init an empty EPUB
"""
self.opf_path = "OEBPS/content.opf" # Define a default folder for contents
self.ncx_path = "OEBPS/toc.ncx"
self.root_folder = "OEBPS"
self.uid = '%s' % uuid.uuid4()
self.info = {"metadata": {},
"manifest": [],
"spine": [],
"guide": []}
self.info["metadata"]["creator"] = "py-clave server"
self.info["metadata"]["title"] = ""
self.info["metadata"]["language"] = ""
self.opf = ET.fromstring(self._init_opf()) # opf property is always a ElementTree
self.ncx = ET.fromstring(self._init_ncx()) # so is ncx. Consistent with self.(opf|ncx) built by __init_read()
self.writestr(self.opf_path, ET.tostring(self.opf, encoding="UTF-8")) # temporary opf & ncx
self.writestr(self.ncx_path, ET.tostring(self.ncx, encoding="UTF-8")) # will be re-init on close()
@property
def author(self):
return self.info["metadata"]["creator"]
@author.setter
def author(self, value):
tmp = self.opf.find(".//{0}creator".format(NAMESPACE["dc"]))
tmp.text = value
self.info["metadata"]["creator"] = value
@property
def title(self):
return self.info["metadata"]["title"]
@title.setter
def title(self, value):
tmp = self.opf.find(".//{0}title".format(NAMESPACE["dc"]))
tmp.text = value
ncx_title = self.ncx.find("{http://www.daisy.org/z3986/2005/ncx/}docTitle")[0]
ncx_title.text = value
self.info["metadata"]["title"] = value
@property
def language(self):
return self.info["metadata"]["language"]
@language.setter
def language(self, value):
tmp = self.opf.find(".//{0}language".format(NAMESPACE["dc"]))
tmp.text = value
self.info["metadata"]["language"] = value
2013-09-16 01:43:01 +00:00
def close(self):
if self.fp is None: # Check file status
return
if self.mode == "r": # check file mode
zipfile.ZipFile.close(self)
return
else:
try:
self._safeclose()
zipfile.ZipFile.close(self) # give back control to superclass close method
except RuntimeError: # zipfile.__del__ destructor calls close(), ignore
return
def _safeclose(self):
"""
Preliminary operations before closing an EPUB
Writes the empty or modified opf-ncx files before closing the zipfile
"""
if self.epub_mode == 'w':
self.writetodisk(self.writename)
else:
self.writetodisk(self.filename)
def _write_epub_zip(self, epub_zip):
"""
writes the epub to the specified writable zipfile instance
:type epub_zip: an empty instance of zipfile.Zipfile, mode=w
:param epub_zip: zip file to write
"""
epub_zip.writestr('mimetype', "application/epub+zip") # requirement of epub container format
epub_zip.writestr('META-INF/container.xml', self._containerxml())
epub_zip.writestr(self.opf_path, ET.tostring(self.opf, encoding="UTF-8"))
epub_zip.writestr(self.ncx_path, ET.tostring(self.ncx, encoding="UTF-8"))
paths = ['mimetype','META-INF/container.xml',self.opf_path,self.ncx_path]+ self._write_files.keys() + self._delete_files
if self.epub_mode != 'w':
for item in self.infolist():
if item.filename not in paths:
epub_zip.writestr(item.filename, self.read(item.filename))
for key in self._write_files.keys():
epub_zip.writestr(key, self._write_files[key])
2013-09-16 01:43:01 +00:00
def _init_opf(self):
"""
Constructor for empty OPF
:type return: xml.minidom.Document
:return: xml.minidom.Document
"""
today = datetime.date.today()
opf_tmpl = """<?xml version="1.0" encoding="utf-8" standalone="yes"?>
<package xmlns="http://www.idpf.org/2007/opf" unique-identifier="BookId" version="2.0">
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">
<dc:identifier id="BookId" opf:scheme="UUID">{uid}</dc:identifier>
<dc:title></dc:title>
<dc:creator></dc:creator>
<dc:language></dc:language>
2013-09-16 01:43:01 +00:00
<dc:date opf:event="modification">{date}</dc:date>
</metadata>
<manifest>
<item href="toc.ncx" id="ncx" media-type="application/x-dtbncx+xml" />
</manifest>
<spine toc="ncx">
</spine>
<guide>
</guide>
</package>"""
doc = opf_tmpl.format(uid=self.uid, date=today)
2013-09-16 01:43:01 +00:00
return doc
def _init_ncx(self):
"""
Constructor for empty OPF
:type return: xml.minidom.Document
:return: xml.minidom.Document
"""
ncx_tmpl = """<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE ncx PUBLIC "-//NISO//DTD ncx 2005-1//EN"
"http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">
<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
<head>
<meta name="dtb:uid" content="{uid}" />
<meta name="dtb:depth" content="0" />
<meta name="dtb:totalPageCount" content="0" />
<meta name="dtb:maxPageNumber" content="0" />
</head>
<docTitle>
<text>{title}</text>
</docTitle>
<navMap>
</navMap>
</ncx>"""
ncx = ncx_tmpl.format(uid=self.uid, title="Default")
return ncx
def _containerxml(self):
template = """<?xml version="1.0" encoding="UTF-8"?>
<container version="1.0"
xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
<rootfiles>
<rootfile full-path="%s"
media-type="application/oebps-package+xml"/>
</rootfiles>
</container>"""
return template % self.opf_path
def _delete(self, *paths):
"""
Delete archive member
:type paths: [str]
2013-09-16 01:43:01 +00:00
:param paths: files to be deleted inside EPUB file
"""
for path in paths:
try:
del self._write_files[path]
except KeyError:
pass
self._delete_files.append(path)
2013-09-20 21:44:37 +00:00
def addmetadata(self, term, value, namespace='dc'):
    """
    Add a metadata entry

    The element is appended to the first child of the OPF package and the
    value is mirrored in ``info["metadata"]``. When the term is already
    present there, the stored value becomes ``[previous value, value]``.

    :type term: str
    :param term: element name/tag for metadata item
    :type value: str
    :param value: a value
    :type namespace: str
    :param namespace: either a '{URI}' or a registered prefix
                      ('dc', 'opf', 'ncx' are currently built-in)
    """
    assert self.epub_mode != "r", "%s is not writable" % self
    namespace = NAMESPACE.get(namespace, namespace)
    element = ET.Element(namespace + term, attrib={})
    element.text = value
    self.opf[0].append(element)
    # note that info is ignoring namespace entirely
    metadata = self.info["metadata"]
    # "in" replaces the deprecated dict.has_key(), matching the membership
    # test used when metadata is read from an existing archive.
    if term in metadata:
        metadata[term] = [metadata[term], value]
    else:
        metadata[term] = value
def _writestr(self, filepath, filebytes):
self._write_files[filepath] = filebytes
2013-09-20 21:44:37 +00:00
2013-09-16 01:43:01 +00:00
def additem(self, fileObject, href, mediatype):
    """
    Add a file to manifest only

    The content is registered for writing under ``root_folder/href`` and an
    <item> with a generated id is appended to the second child of the OPF
    package.

    :type fileObject: StringIO
    :param fileObject: object exposing getvalue(), or the raw content itself
    :type href: str
    :param href: path of the file, relative to the root folder
    :type mediatype: str
    :param mediatype: media type recorded in the manifest
    :return: the id generated for the manifest item
    """
    assert self.epub_mode != "r", "%s is not writable" % self
    element = ET.Element(NAMESPACE.get("opf") + "item",
                         attrib={"id": "id_" + str(uuid.uuid4())[:5],
                                 "href": href,
                                 "media-type": mediatype})

    # Take the buffer's content when there is one, else use the object as is.
    getvalue = getattr(fileObject, "getvalue", None)
    payload = getvalue() if getvalue is not None else fileObject
    # Only text needs encoding. Calling encode() on bytes either fails or
    # forces an implicit ASCII decode, so byte content is stored untouched.
    if isinstance(payload, type(u"")):
        payload = payload.encode('utf-8')

    self._writestr(os.path.join(self.root_folder, element.attrib["href"]), payload)
    self.opf[1].append(element)
    return element.attrib["id"]
def addpart(self, fileObject, href, mediatype, position=None, reftype="text", linear="yes"):
    """
    Add a file as part of the epub file, i.e. to manifest and spine (and guide?)

    The guide only receives a reference when ``info["guide"]`` is non-empty.

    :param fileObject: file to be inserted
    :param href: path inside the epub archive
    :param mediatype: mimetype of the fileObject
    :type position: int
    :param position: order in spine [from 0 to len(opf/manifest))]; None, or
                     a value past the end of the spine, appends
    :param linear: linear="yes" or "no"
    :param reftype: type to assign in guide/reference
    """
    assert self.epub_mode != "r", "%s is not writable" % self
    opf_ns = NAMESPACE.get("opf")
    fileid = self.additem(fileObject, href, mediatype)
    itemref = ET.Element(opf_ns + "itemref",
                         attrib={"idref": fileid, "linear": linear})
    reference = ET.Element(opf_ns + "reference",
                           attrib={"title": href, "href": href, "type": reftype})

    spine = self.opf[2]
    if position is None or position > len(spine):
        # No usable position: add at the end of the spine (and guide).
        spine.append(itemref)
        if self.info["guide"]:
            self.opf[3].append(reference)
        return

    spine.insert(position, itemref)
    # Only mirror the position in the guide when it is long enough.
    if self.info["guide"] and len(self.opf[3]) >= position + 1:
        self.opf[3].insert(position, reference)
def writetodisk(self, filename):
"""
Writes the in-memory archive to disk
:type filename: str
:param filename: name of the file to be writte
"""
if isinstance(filename, str):
filename = open(filename,'w')
filename.seek(0)
new_zip = zipfile.ZipFile(filename, 'w')
self._write_epub_zip(new_zip)
new_zip.close()