regluit/notebooks/mobigen_integrate.ipynb

701 lines
19 KiB
Plaintext
Raw Normal View History

{
"metadata": {
"name": "",
2014-12-16 21:46:24 +00:00
"signature": "sha256:5f0de387b7cbfe304b5cd1aa8db4f63ffa60fc4e3f17b1980a7fe3cb473b02cd"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"# compute whether we can apply mobigen to a given edition to produce a mobi file\n",
"# need to have an ebook in epub or html format (the formats edition_mobi_status checks for)\n",
"# possible return values: already has a mobi file / can generate a mobi file / not possible\n",
"\n",
"def edition_mobi_status(edition):\n",
"    \"\"\"\n",
"    Classify an edition by its mobi availability.\n",
"\n",
"    Returns 1 when the edition already has a mobi ebook, 0 when it has no mobi\n",
"    but does have an epub or html ebook to convert from, and -1 when there is\n",
"    neither a mobi nor an epub/html source.\n",
"    \"\"\"\n",
"    available = {ebook.format for ebook in edition.ebooks.all()}\n",
"    if 'mobi' in available:\n",
"        return 1\n",
"    if available & {'epub', 'html'}:\n",
"        return 0\n",
"    return -1"
],
"language": "python",
"metadata": {},
2014-12-16 21:46:24 +00:00
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from collections import Counter\n",
"from regluit.core.models import Edition\n",
"\n",
"# of all the Editions with an ebook, compute the \"mobi status\"\n",
"# filtering across the ebooks relation yields an edition once per matching ebook,\n",
"# so .distinct() is needed to count each edition only once\n",
"Counter(edition_mobi_status(edition) for edition in Edition.objects.filter(ebooks__isnull=False).distinct())"
],
"language": "python",
"metadata": {},
2014-12-16 21:46:24 +00:00
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# looking only at Campaign related ebooks\n",
"\n",
"from regluit.core.models import Campaign\n",
"\n",
"Counter([edition_mobi_status(campaign.edition) for campaign in Campaign.objects.filter(edition__ebooks__isnull=False).distinct()])"
],
"language": "python",
"metadata": {},
2014-12-16 21:46:24 +00:00
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import requests\n",
"# certificate verification is left at the requests default (enabled); verify=False would\n",
"# accept any certificate and expose the download to tampering\n",
"r = requests.get(\"https://archive.org/download/Feeding_the_City/9781909254039_Feeding_the_City.epub\")\n",
"# fail loudly on an HTTP error instead of carrying an error page forward as if it were the epub\n",
"r.raise_for_status()"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from regluit.core.mobigen import convert_to_mobi\n",
"\n",
"output = convert_to_mobi(\"https://archive.org/download/Feeding_the_City/9781909254039_Feeding_the_City.epub\")\n",
"\n",
"# keep a local copy of the conversion output so it can be inspected\n",
"with open(\"/Users/raymondyee/Downloads/test.mobi\", \"wb\") as mobi_out:\n",
"    mobi_out.write(output)\n"
],
"language": "python",
"metadata": {},
2014-12-11 16:43:03 +00:00
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"!ls -lt /Users/raymondyee/Downloads/test.mobi"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# let's remind ourselves about how to write to S3 and integrate an ebook record\n",
"# https://github.com/Gluejar/regluit/blob/36793b0b118fd97b52ab0d8637e8e34ab6d8672e/core/models.py#L1776\n",
"\n",
"#ebf holding the books we generate in the watermarking/custom messaging process, right?\n",
"\n",
"from regluit.core.models import EbookFile\n",
"\n",
"for stored in EbookFile.objects.all():\n",
"    # show each EbookFile with its file field and the title of its edition\n",
"    print (stored, stored.file, stored.edition.title)"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# how to work with EbookFile\n",
"# https://github.com/Gluejar/regluit/blob/792659c325a7bee2b49337408336fdeadab3464a/core/models.py#L904\n",
"# Campaign."
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"for campaign in Campaign.objects.filter(edition__ebooks__isnull=False).distinct():\n",
"    if edition_mobi_status(campaign.edition) == 0:  # possible to generate mobi\n",
"        # status 0 means an epub *or* html ebook exists; filtering on epub alone raises\n",
"        # IndexError for html-only editions, so prefer epub and fall back to html\n",
"        ebooks = campaign.edition.ebooks\n",
"        sources = list(ebooks.filter(format='epub')[:1]) or list(ebooks.filter(format='html')[:1])\n",
"        print(campaign.edition.title, sources[0].url)"
],
"language": "python",
"metadata": {},
2014-12-11 16:43:03 +00:00
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"pledge, b2u, t4u"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"c = Campaign.objects.all()[0]\n",
"c.edition.ebooks.all(), c.type"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Ebook is not imported by any earlier cell, so bring it into scope here\n",
"from regluit.core.models import Ebook\n",
"\n",
"Ebook.objects.filter(format='epub').filter(edition__id=202594)"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Q and F are not imported by any earlier cell, so bring them into scope here\n",
"from django.db.models import F, Q\n",
"from regluit.core.models import Identifier\n",
"\n",
"# count identifiers whose edition belongs to a different work than the identifier's own work\n",
"Identifier.objects.filter(edition__isnull=False).filter(~Q(edition__work__id = F('work__id'))).count()"
],
"language": "python",
"metadata": {},
"outputs": []
2014-12-11 16:43:03 +00:00
},
{
"cell_type": "heading",
"level": 1,
"metadata": {},
"source": [
"Using the low-level Django File Storage API"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"I need to remind myself about how the [File Storage API](https://docs.djangoproject.com/en/1.4/ref/files/storage/) works.\n",
"\n",
"We can use `default_storage` directly to read, write [file storage objects](https://docs.djangoproject.com/en/1.4/topics/files/#storage-objects) and to test for existence.\n",
"\n",
"I'm a bit unclear about the relevance of [FileField](https://docs.djangoproject.com/en/1.4/ref/models/fields/#filefield).\n",
"\n",
"And how does `EbookFile` work? Look at https://github.com/Gluejar/regluit/blob/f7b796c6a6d220f6475dbfdc0a8aeb16a09e84b1/core/models.py#L1777:\n",
"\n",
"```python\n",
"class EbookFile(models.Model):\n",
" file = models.FileField(upload_to=path_for_file)\n",
"``` \n",
"\n",
"I should be able to find hints about how to instantiate an `EbookFile` in the right way."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"BTW, is there a tension between the standard storage API and the extras provided by the django-storages S3 backend (https://django-storages.readthedocs.org/en/latest/backends/amazon-S3.html)?"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from django.core.files.storage import default_storage\n",
"from django.core.files.base import ContentFile, File\n",
2014-12-16 21:46:24 +00:00
"#from django.core.cache import cache"
2014-12-11 16:43:03 +00:00
],
"language": "python",
"metadata": {},
2014-12-16 21:46:24 +00:00
"outputs": []
2014-12-11 16:43:03 +00:00
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"default_storage.listdir(\"/\")"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# but I can't use exists to test existence of S3 folder\n",
"default_storage.exists('ebf'), default_storage.exists('/ebf'), default_storage.exists('/ebf/')"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# can test existence of files, but not appa\n",
"default_storage.exists(\"/Users/rdhyee/covers/52/AWizardOfEarthsea(1stEd).jpg\")"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# list all the keys in the storage bucket\n",
"default_storage.bucket.get_all_keys()"
],
"language": "python",
"metadata": {},
"outputs": []
},
2014-12-16 21:46:24 +00:00
{
"cell_type": "code",
"collapsed": false,
"input": [
"k = default_storage.bucket.get_all_keys()[0]"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"k.url"
],
"language": "python",
"metadata": {},
"outputs": []
},
2014-12-11 16:43:03 +00:00
{
"cell_type": "code",
"collapsed": false,
"input": [
"# I was expecting true\n",
"default_storage.exists('ebf')"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"default_storage.exists('storage_test')"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"file = default_storage.open('storage_test', 'w')\n",
"file.write('storage contents')\n",
"file.close()\n",
"\n",
"default_storage.exists('storage_test')"
],
"language": "python",
"metadata": {},
"outputs": []
},
2014-12-16 21:46:24 +00:00
{
"cell_type": "code",
"collapsed": false,
"input": [
"# inspect the attributes of the storage file object (a bare `file.` is a SyntaxError on Run All)\n",
"dir(file)"
],
"language": "python",
"metadata": {},
"outputs": []
},
2014-12-11 16:43:03 +00:00
{
"cell_type": "code",
"collapsed": false,
"input": [
"file = default_storage.open('storage_test', 'r')\n",
"try:\n",
"    contents = file.read()\n",
"finally:\n",
"    # close the storage file even if the read fails\n",
"    file.close()\n",
"\n",
"# make the contents the cell's value; a bare file.read() in the middle of a cell is never displayed\n",
"contents"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"default_storage.delete('storage_test')\n",
"default_storage.exists('storage_test')"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# clean up some files\n",
"# WARNING: this deletes every file listed directly under /ebf in the default storage\n",
"\n",
"print (default_storage.bucket)\n",
"\n",
"# listdir returns (directories, files); index 1 is the list of file names\n",
"for key in default_storage.listdir(\"/ebf\")[1]:\n",
"    # call form of print, consistent with the rest of the notebook (the bare print statement is Python-2-only)\n",
"    print (default_storage.delete(\"/ebf/\" + key))\n"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "heading",
"level": 1,
"metadata": {},
"source": [
"How to write the results of the conversion"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
2014-12-16 21:46:24 +00:00
"from regluit.core.models import EbookFile\n",
"from django.core.files.storage import default_storage\n",
"from django.core.files.base import ContentFile, File\n",
"\n",
"# http://stackoverflow.com/a/519653\n",
"\n",
"def read_in_chunks(file_object, chunk_size=1024):\n",
"    \"\"\"Yield successive pieces of file_object, each obtained with read(chunk_size).\n",
"\n",
"    Iteration ends at the first empty (falsy) read result.\n",
"    Default chunk size: 1k.\"\"\"\n",
"    chunk = file_object.read(chunk_size)\n",
"    while chunk:\n",
"        yield chunk\n",
"        chunk = file_object.read(chunk_size)"
2014-12-11 16:43:03 +00:00
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
2014-12-16 21:46:24 +00:00
"# close the local file once the upload is done instead of leaking the handle\n",
"with open(\"/Users/raymondyee/Downloads/hello.mobi\", \"rb\") as md_book:\n",
"    md_s3 = File(md_book)\n",
"    md_s3.content_type = \"application/x-mobipocket-ebook\"\n",
"    saved_name = default_storage.save(\"/ebf/hello.mobi\", md_s3)\n",
"\n",
"# the name returned by default_storage.save()\n",
"saved_name"
2014-12-11 16:43:03 +00:00
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
2014-12-16 21:46:24 +00:00
"def write_file_to_storage(file_object, content_type, path):\n",
"    \"\"\"\n",
"    Save file_object to default_storage under path and return the wrapping File.\n",
"\n",
"    file_object: an open file-like object to upload\n",
"    content_type: stored on the File as its content_type attribute\n",
"    path: the name requested in default_storage\n",
"\n",
"    The returned File's name is set to the name returned by default_storage.save(),\n",
"    so default_storage.url(returned.name) refers to the stored file rather than to\n",
"    the name of the local file_object.\n",
"    \"\"\"\n",
"    file_s3 = File(file_object)\n",
"    file_s3.content_type = content_type\n",
"\n",
"    # File() takes its name from the wrapped file object (e.g. a local path);\n",
"    # save() returns the name the storage backend actually used, so record that instead\n",
"    file_s3.name = default_storage.save(path, file_s3)\n",
"    return file_s3"
2014-12-11 16:43:03 +00:00
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
2014-12-16 21:46:24 +00:00
"# open in binary mode (mobi is a binary format) and close the handle after the upload\n",
"with open(\"/Users/raymondyee/Downloads/hello.mobi\", \"rb\") as mobi_source:\n",
"    file_ = write_file_to_storage(mobi_source,\n",
"                                  \"application/x-mobipocket-ebook\",\n",
"                                  \"/ebf/hello.mobi\")\n",
"\n"
2014-12-11 16:43:03 +00:00
],
"language": "python",
"metadata": {},
2014-12-16 21:46:24 +00:00
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"file_.name"
2014-12-11 16:43:03 +00:00
],
2014-12-16 21:46:24 +00:00
"language": "python",
"metadata": {},
"outputs": []
2014-12-11 16:43:03 +00:00
},
{
"cell_type": "code",
"collapsed": false,
"input": [
2014-12-16 21:46:24 +00:00
"default_storage.url(file_.name)"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "heading",
"level": 1,
"metadata": {},
"source": [
"Sample: write a sample mobi file as an Ebook for a campaign book"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"for campaign in Campaign.objects.filter(edition__ebooks__isnull=False).distinct():\n",
"    if edition_mobi_status(campaign.edition) == 0:  # possible to generate mobi\n",
"        # status 0 means an epub *or* html ebook exists; filtering on epub alone raises\n",
"        # IndexError for html-only editions, so prefer epub and fall back to html\n",
"        ebooks = campaign.edition.ebooks\n",
"        sources = list(ebooks.filter(format='epub')[:1]) or list(ebooks.filter(format='html')[:1])\n",
"        print(campaign.edition.title, sources[0].url)"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# generator for editions to add mobi to\n",
2014-12-11 16:43:03 +00:00
"\n",
2014-12-16 21:46:24 +00:00
"def editions_to_convert():\n",
"    \"\"\"Yield the edition of each campaign whose edition_mobi_status is 0.\n",
"\n",
"    Only campaigns whose edition has at least one ebook are considered.\n",
"    \"\"\"\n",
"    campaigns_with_ebooks = Campaign.objects.filter(edition__ebooks__isnull=False).distinct()\n",
"    for campaign in campaigns_with_ebooks:\n",
"        candidate = campaign.edition\n",
"        if edition_mobi_status(candidate) != 0:  # anything other than \"possible to generate mobi\"\n",
"            continue\n",
"        yield candidate"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"list(editions_to_convert())"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import uuid"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from itertools import islice\n",
"# Ebook is not imported by any earlier cell, so bring it into scope here\n",
"from regluit.core.models import Ebook\n",
"\n",
"for edition in islice(editions_to_convert(), 1):\n",
"    print (edition)\n",
"\n",
"    # pull out the sister edition to convert from\n",
"    # NOTE(review): editions_to_convert() selects on epub/html, but this filter uses epub/pdf;\n",
"    # confirm which source formats the conversion accepts. Editions with no match are skipped\n",
"    # instead of failing with IndexError.\n",
"    sister_ebooks = list(edition.ebooks.filter(format__in=['epub', 'pdf'])[:1])\n",
"    if not sister_ebooks:\n",
"        print (\"no epub/pdf ebook to convert from\")\n",
"        continue\n",
"    sister_ebook = sister_ebooks[0]\n",
"\n",
"    # run the conversion process\n",
"    # output = StringIO(convert_to_mobi(sister_ebook.url))\n",
"    # binary mode because mobi is a binary format; the with block closes the handle afterwards\n",
"    with open(\"/Users/raymondyee/Downloads/hello.mobi\", \"rb\") as output:\n",
"        file_ = write_file_to_storage(output,\n",
"                                      \"application/x-mobipocket-ebook\",\n",
"                                      \"/ebf/hello.mobi\")\n",
"\n",
"        # create a path for the ebookfile:\n",
"        # https://github.com/Gluejar/regluit/blob/25dcb06f464dc11b5e589ab6859dfcc487f8f3ef/core/models.py#L1771\n",
"\n",
"        # saved while the source file is still open, since saving the file field reads from it\n",
"        ebfile = EbookFile(edition=edition, file=file_, format='mobi')\n",
"        ebfile.save()\n",
"\n",
"    # maybe need to create an ebook pointing to ebookFile ?\n",
"    # copy metadata from sister ebook\n",
"\n",
"    # take the URL from the saved EbookFile's file field, so the Ebook points at the file\n",
"    # stored for that record rather than at a name derived from the local upload\n",
"    ebfile_url = ebfile.file.url\n",
"    print (ebfile_url)\n",
"\n",
"    ebook = Ebook(url=ebfile_url,\n",
"                  format=\"mobi\",\n",
"                  provider=\"Unglue.it\",\n",
"                  rights=sister_ebook.rights,\n",
"                  edition=edition)\n",
"    ebook.save()"
2014-12-11 16:43:03 +00:00
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
2014-12-16 21:46:24 +00:00
"# from django.db.models import Q\n",
"# Item.objects.filter(creator__in=creators)\n",
"edition.ebooks.filter(format__in=['epub', 'pdf'])[0]"
2014-12-11 16:43:03 +00:00
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
2014-12-16 21:46:24 +00:00
"edition_mobi_status(edition)"
2014-12-11 16:43:03 +00:00
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
2014-12-16 21:46:24 +00:00
"# check connection between edition and Ebook, Ebookfile before creating mobi.\n",
"[(ebook.id, edition.ebooks.all(), edition.ebook_files.all()) for ebook in edition.ebooks.all()]"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"[(eb.url, eb.format, eb.provider, eb.rights, eb.edition) for eb in edition.ebooks.all()]"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# checking on relationship between Ebookfile and Ebook\n",
2014-12-11 16:43:03 +00:00
"\n",
2014-12-16 21:46:24 +00:00
"[ebf for ebf in EbookFile.objects.all() if ebf.active]"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"edition.ebooks.all()"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"edition.ebook_files.all()"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"ebf.save()"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"ebf.id"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"ebf.edition.ebooks.all()"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"edition_mobi_status(edition)"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
2014-12-11 16:43:03 +00:00
"\n",
"\n",
2014-12-16 21:46:24 +00:00
"ebf1= EbookFile(edition=e1, file=file_, format='mobi')"
2014-12-11 16:43:03 +00:00
],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}