regluit/notebooks/mobigen_integrate.ipynb

237 lines
7.4 KiB
Plaintext

{
"metadata": {
"name": "",
"signature": "sha256:c3439815c10b9795d3d95691a1edd9789672f61dc0b415cd57b8e06d0552f8a5"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Compute whether we can apply mobigen to a given edition to produce a mobi file.\n",
"# Conversion needs an existing ebook in epub or html format.\n",
"# Possible results: already has a mobi file / can generate a mobi file / not possible.\n",
"\n",
"def edition_mobi_status(edition):\n",
"    \"\"\"\n",
"    Report whether `edition` has, or could be given, a mobi ebook.\n",
"\n",
"    Looks at the `format` of every ebook in `edition.ebooks.all()` and returns:\n",
"      1  if there is already a mobi ebook,\n",
"      0  if there is none but there is an epub or html ebook to convert from,\n",
"     -1  if there is no epub/html to convert from.\n",
"    \"\"\"\n",
"    formats = {ebook.format for ebook in edition.ebooks.all()}\n",
"    if 'mobi' in formats:\n",
"        return 1\n",
"    if formats & {'epub', 'html'}:\n",
"        return 0\n",
"    return -1"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from collections import Counter\n",
"from regluit.core.models import Edition\n",
"\n",
"# Of all the Editions with at least one ebook, tally the \"mobi status\".\n",
"# distinct() is needed because filtering across the ebooks relation yields one row per matching ebook,\n",
"# so an edition with several ebooks would otherwise be counted several times.\n",
"Counter(edition_mobi_status(edition) for edition in Edition.objects.filter(ebooks__isnull=False).distinct())"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Mobi-status tally restricted to the editions of Campaigns that have ebooks\n",
"\n",
"from regluit.core.models import Campaign\n",
"\n",
"Counter([\n",
"    edition_mobi_status(campaign.edition)\n",
"    for campaign in Campaign.objects.filter(edition__ebooks__isnull=False).distinct()\n",
"])"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import requests\n",
"\n",
"# Fetch the sample epub with certificate verification left on (the requests default);\n",
"# verify=False would silently accept a forged certificate.\n",
"# The timeout keeps the cell from hanging forever if the server does not answer.\n",
"r = requests.get(\"https://archive.org/download/Feeding_the_City/9781909254039_Feeding_the_City.epub\", timeout=60)\n",
"r.raise_for_status()  # fail loudly on an HTTP error instead of keeping an error page in r"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import os\n",
"\n",
"from regluit.core.mobigen import convert_to_mobi\n",
"\n",
"output = convert_to_mobi(\"https://archive.org/download/Feeding_the_City/9781909254039_Feeding_the_City.epub\")\n",
"\n",
"# Write under the current user's home directory rather than one developer's hard-coded /Users/... path\n",
"with open(os.path.join(os.path.expanduser(\"~\"), \"Downloads\", \"test.mobi\"), \"wb\") as f:\n",
"    f.write(output)\n"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stderr",
"text": [
"/Users/raymondyee/anaconda/envs/regluit/lib/python2.7/site-packages/requests/packages/urllib3/connectionpool.py:730: InsecureRequestWarning: Unverified HTTPS request is being made. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.org/en/latest/security.html (This warning will only appear once by default.)\n",
" InsecureRequestWarning)\n"
]
}
],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"!ls -lt ~/Downloads/test.mobi"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Reminder of how to write to S3 and integrate an ebook record:\n",
"# https://github.com/Gluejar/regluit/blob/36793b0b118fd97b52ab0d8637e8e34ab6d8672e/core/models.py#L1776\n",
"\n",
"# Open question: is \"ebf\" what holds the books we generate in the watermarking/custom messaging process?\n",
"\n",
"from regluit.core.models import EbookFile\n",
"\n",
"# List every EbookFile together with its stored file and the title of its edition\n",
"for ebookfile in EbookFile.objects.all():\n",
"    print (ebookfile, ebookfile.file, ebookfile.edition.title)"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# How to work with EbookFile -- see:\n",
"# https://github.com/Gluejar/regluit/blob/792659c325a7bee2b49337408336fdeadab3464a/core/models.py#L904\n",
"# TODO: unfinished note -- the original trailed off at 'Campaign.'"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"for campaign in Campaign.objects.filter(edition__ebooks__isnull=False).distinct():\n",
"    edition = campaign.edition\n",
"    status = edition_mobi_status(edition)  # computed once and reused below\n",
"    print (edition.title, status)\n",
"    if status == 0:  # possible to generate mobi\n",
"        # Status 0 also covers editions that only have html, so there may be no epub.\n",
"        # Slicing with [:1] yields at most one ebook and prints nothing in that case,\n",
"        # whereas indexing [0] on an empty queryset raises IndexError.\n",
"        for epub_ebook in edition.ebooks.filter(format='epub')[:1]:\n",
"            print(epub_ebook.url)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"(u'Open Access eBooks', 1)\n",
"(u'Oral Literature in Africa', 1)\n",
"(u'The Third Awakening', 1)\n",
"(u'Feeding the City', 0)\n",
"https://archive.org/download/Feeding_the_City/9781909254039_Feeding_the_City.epub\n",
"(u'Complex Predicates', -1)\n",
"(u'Flatland', 1)\n",
"(u'Green Comet', 1)\n",
"(u'23rd Century Romance', 1)\n",
"(u'Moebius Noodles', -1)\n",
"(u'The Classic Short Story, 1870-1925: Theory of a Genre', -1)\n",
"(u'The Pains', -1)\n",
"(u'The Global Librarian', 0)\n",
"https://unglueit-files.s3.amazonaws.com/ebf/619c98c3192c695caabdce71766e7245.epub\n",
"(u'Heaven - The Afterlife Series I', 1)\n",
"(u'Digitization in the Real World', -1)\n",
"(u'Zero Sum Game', 1)"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"(u'Introduction to High Performance Scientific Computing', -1)\n",
"(u'Libres conseils : Ce que nous aurions aime\\u0301 savoir avant de commencer', 1)\n",
"(u'Option Libre', 0)\n",
"https://unglueit-files.s3.amazonaws.com/ebf/830cac2e0b26dfe576e6658623f6243a.epub\n",
"(u'Libres conseils. Ce que nous aurions aim\\xe9 savoir avant de commencer', 0)\n",
"https://unglueit-files.s3.amazonaws.com/ebf/f35b38527140a26cf44aa37bf540f24f.epub\n"
]
}
],
"prompt_number": 10
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# pledge, b2u, t4u  -- presumably the possible values of Campaign.type (see c.type in the next cell); TODO confirm"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Peek at the first campaign returned: the ebooks of its edition, plus the campaign type\n",
"c = Campaign.objects.all()[0]\n",
"(c.edition.ebooks.all(), c.type)"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from regluit.core.models import Ebook  # Ebook was never imported in this notebook\n",
"\n",
"# epub ebooks attached to the edition with id 202594\n",
"Ebook.objects.filter(format='epub').filter(edition__id=202594)"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from django.db.models import F, Q  # used below but never imported in this notebook\n",
"from regluit.core.models import Identifier\n",
"\n",
"# Count identifiers attached to an edition whose work is not the identifier's own work\n",
"Identifier.objects.filter(edition__isnull=False).filter(~Q(edition__work__id=F('work__id'))).count()"
],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}