From eb7bba0eaaf303742076102d10ac08d450945d66 Mon Sep 17 00:00:00 2001 From: Raymond Yee Date: Thu, 18 Dec 2014 14:52:06 -0500 Subject: [PATCH] work in progress --- notebooks/mobigen_integrate.ipynb | 481 +++++++++++------------------- 1 file changed, 180 insertions(+), 301 deletions(-) diff --git a/notebooks/mobigen_integrate.ipynb b/notebooks/mobigen_integrate.ipynb index a88a8cec..349bc8e4 100644 --- a/notebooks/mobigen_integrate.ipynb +++ b/notebooks/mobigen_integrate.ipynb @@ -1,7 +1,7 @@ { "metadata": { "name": "", - "signature": "sha256:5f0de387b7cbfe304b5cd1aa8db4f63ffa60fc4e3f17b1980a7fe3cb473b02cd" + "signature": "sha256:f02ae8a8cd487879963980d4c8030c6f3082ce0cdb55608e6ae28290f1d2fb8c" }, "nbformat": 3, "nbformat_minor": 0, @@ -21,7 +21,7 @@ " for a given edition, return 1 if there is a mobi ebook, 0 if there is none but we have an epub or html to convert from,\n", " and -1 for no epub/html to convert from\n", " \"\"\"\n", - " formats = set([ebook.format for ebook in edition.ebooks.all()])\n", + " formats = set([ebook.format for ebook in edition.work.ebooks()])\n", " if 'mobi' in formats:\n", " return 1\n", " elif ('epub' in formats) or ('html' in formats):\n", @@ -31,51 +31,71 @@ ], "language": "python", "metadata": {}, - "outputs": [] + "outputs": [], + "prompt_number": 3 }, { "cell_type": "code", "collapsed": false, "input": [ - "from collections import Counter\n", - "from regluit.core.models import Edition\n", + "# generator for editions to add mobi to\n", + "# campaigns that can have mobi files but don't yet.\n", "\n", - "# of all the Editions with ebook, compute the \"mobi status\"\n", - "Counter([edition_mobi_status(edition) for edition in Edition.objects.filter(ebooks__isnull=False).all()])" + "def editions_to_convert():\n", + " for campaign in Campaign.objects.filter(edition__ebooks__isnull=False).distinct():\n", + " if edition_mobi_status(campaign.edition) == 0: # possible to generate mobi\n", + " yield campaign.edition\n", + " \n", + " \n", + "list(editions_to_convert())" ], "language": "python", "metadata": {}, - "outputs": [] + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 8, + "text": [ + "[,\n", + " ,\n", + " ]" + ] + } + ], + "prompt_number": 8 }, { "cell_type": "code", "collapsed": false, "input": [ - "# looking only at Campaign related ebooks\n", + "# http://127.0.0.1:8000/work/138133/ --> The Global Librarian\n", "\n", - "from regluit.core.models import Campaign\n", - "\n", - "Counter([edition_mobi_status(campaign.edition) for campaign in Campaign.objects.filter(edition__ebooks__isnull=False).distinct()])" + "from itertools import islice\n", + "edition = list(islice(editions_to_convert(),1))[0]\n", + "edition.work.ebooks(), edition.work.id" ], "language": "python", "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "import requests\n", - "r = requests.get(\"https://archive.org/download/Feeding_the_City/9781909254039_Feeding_the_City.epub\", verify=False)" + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 9, + "text": [ + "([], 138133L)" + ] + } ], - "language": "python", - "metadata": {}, - "outputs": [] + "prompt_number": 9 }, { "cell_type": "code", "collapsed": false, "input": [ + "# sample code to use convert_to_mobi \n", + "# write output to file system\n", + "\n", "from regluit.core.mobigen import convert_to_mobi\n", "\n", "output = convert_to_mobi(\"https://archive.org/download/Feeding_the_City/9781909254039_Feeding_the_City.epub\")\n", @@ -87,34 +107,6 @@ "metadata": {}, "outputs": [] }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "!ls -lt /Users/raymondyee/Downloads/test.mobi" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "# let's remind ourselves about how to write to S3 and integrate an ebook record\n", - "# https://github.com/Gluejar/regluit/blob/36793b0b118fd97b52ab0d8637e8e34ab6d8672e/core/models.py#L1776\n", - "\n", - "#ebf holding the books we generate in the watermarking/custom messaging process, right?\n", - "\n", - "from regluit.core.models import EbookFile\n", - "\n", - "for ebookfile in EbookFile.objects.all():\n", - " print (ebookfile, ebookfile.file, ebookfile.edition.title)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, { "cell_type": "code", "collapsed": false, @@ -127,61 +119,6 @@ "metadata": {}, "outputs": [] }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "for campaign in Campaign.objects.filter(edition__ebooks__isnull=False).distinct():\n", - " #print (campaign.edition.title, edition_mobi_status(campaign.edition))\n", - " if edition_mobi_status(campaign.edition) == 0: # possible to generate mobi\n", - " print(campaign.edition.title, campaign.edition.ebooks.filter(format='epub')[0].url)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pledge, b2u, t4u" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "c = Campaign.objects.all()[0]\n", - "c.edition.ebooks.all(), c.type" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "Ebook.objects.filter(format='epub').filter(edition__id=202594)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from regluit.core.models import Identifier\n", - "Identifier.objects.filter(edition__isnull=False).filter(~Q(edition__work__id = F('work__id'))).count()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, { "cell_type": "heading", "level": 1, @@ -222,12 +159,12 @@ "collapsed": false, "input": [ "from django.core.files.storage import default_storage\n", - "from django.core.files.base import ContentFile, File\n", - "#from django.core.cache import cache" + "from django.core.files.base import ContentFile, File" ], "language": "python", "metadata": {}, - "outputs": [] + "outputs": [], + "prompt_number": 10 }, { "cell_type": "code", @@ -237,29 +174,17 @@ ], "language": "python", "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "# but I can't use exists to test existence of S3 folder\n", - "default_storage.exists('ebf'), default_storage.exists('/ebf'), default_storage.exists('/ebf/')" + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 20, + "text": [ + "([u'ebf', u'Users', u'doab', u'marc_test'], [u'storage_test'])" + ] + } ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "# can test existence of files, but not appa\n", - "default_storage.exists(\"/Users/rdhyee/covers/52/AWizardOfEarthsea(1stEd).jpg\")" - ], - "language": "python", - "metadata": {}, - "outputs": [] + "prompt_number": 20 }, { "cell_type": "code", @@ -273,35 +198,12 @@ "outputs": [] }, { - "cell_type": "code", - "collapsed": false, - "input": [ - "k = default_storage.bucket.get_all_keys()[0]" - ], - "language": "python", + "cell_type": "heading", + "level": 1, "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "k.url" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "# I was expecting true\n", - "default_storage.exists('ebf')" - ], - "language": "python", - "metadata": {}, - "outputs": [] + "source": [ + "Sample code for using default_storage" + ] }, { "cell_type": "code", @@ -364,7 +266,7 @@ "cell_type": "code", "collapsed": false, "input": [ - "# clean up some files\n", + "# clean up some files \n", "\n", "print (default_storage.bucket)\n", "\n", @@ -404,28 +306,18 @@ ], "language": "python", "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "md_book = open(\"/Users/raymondyee/Downloads/hello.mobi\", \"rb\")\n", - "md_s3 = File(md_book)\n", - "md_s3.content_type = \"application/x-mobipocket-ebook\"\n", - "\n", - "default_storage.save(\"/ebf/hello.mobi\", md_s3)" - ], - "language": "python", - "metadata": {}, - "outputs": [] + "outputs": [], + "prompt_number": 12 }, { "cell_type": "code", "collapsed": false, "input": [ "def write_file_to_storage(file_object, content_type, path):\n", - " file_s3 = File(file_object)\n", + " \"\"\"\n", + " write file_object to the default_storage at given path\n", + " \"\"\"\n", + " file_s3 = ContentFile(file_object)\n", " file_s3.content_type = content_type\n", " \n", " default_storage.save(path, file_s3)\n", @@ -433,20 +325,24 @@ ], "language": "python", "metadata": {}, - "outputs": [] + "outputs": [], + "prompt_number": 13 }, { "cell_type": "code", "collapsed": false, "input": [ - "file_ = write_file_to_storage(open(\"/Users/raymondyee/Downloads/hello.mobi\"), \n", + "import uuid\n", + "\n", + "file_ = write_file_to_storage(open(\"/Users/raymondyee/Downloads/hello.mobi\").read(), \n", " \"application/x-mobipocket-ebook\", \n", - " \"/ebf/hello.mobi\")\n", + " \"/ebf/{0}.mobi\".format(uuid.uuid4().get_hex()))\n", "\n" ], "language": "python", "metadata": {}, - "outputs": [] + "outputs": [], + "prompt_number": 14 }, { "cell_type": "code", @@ -456,7 +352,17 @@ ], "language": "python", "metadata": {}, - "outputs": [] + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 15, + "text": [ + "'/ebf/304dbd385e384e6cbe9fdec019004b69.mobi'" + ] + } + ], + "prompt_number": 15 }, { "cell_type": "code", @@ -466,7 +372,17 @@ ], "language": "python", "metadata": {}, - "outputs": [] + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 16, + "text": [ + "'https://ry-dev-unglueit.s3.amazonaws.com/ebf/304dbd385e384e6cbe9fdec019004b69.mobi'" + ] + } + ], + "prompt_number": 16 }, { "cell_type": "heading", @@ -487,24 +403,18 @@ ], "language": "python", "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "# generator for editions to add mobi to\n", - "\n", - "def editions_to_convert():\n", - " for campaign in Campaign.objects.filter(edition__ebooks__isnull=False).distinct():\n", - " #print (campaign.edition.title, edition_mobi_status(campaign.edition))\n", - " if edition_mobi_status(campaign.edition) == 0: # possible to generate mobi\n", - " yield campaign.edition\n", - " " + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "(u'The Global Librarian', u'https://unglueit-files.s3.amazonaws.com/ebf/619c98c3192c695caabdce71766e7245.epub')\n", + "(u'Option Libre', u'https://unglueit-files.s3.amazonaws.com/ebf/830cac2e0b26dfe576e6658623f6243a.epub')\n", + "(u'Libres conseils. Ce que nous aurions aim\\xe9 savoir avant de commencer', u'https://unglueit-files.s3.amazonaws.com/ebf/f35b38527140a26cf44aa37bf540f24f.epub')\n" + ] + } ], - "language": "python", - "metadata": {}, - "outputs": [] + "prompt_number": 17 }, { "cell_type": "code", @@ -514,41 +424,53 @@ ], "language": "python", "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "import uuid" + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 18, + "text": [ + "[,\n", + " ,\n", + " ]" + ] + } ], - "language": "python", - "metadata": {}, - "outputs": [] + "prompt_number": 18 }, { "cell_type": "code", "collapsed": false, "input": [ "from itertools import islice\n", + "from StringIO import StringIO\n", + "\n", + "from regluit.core.mobigen import convert_to_mobi\n", + "\n", + "import uuid\n", + "\n", "for edition in islice(editions_to_convert(),1):\n", - " print (edition)\n", + " print (edition, edition.work.id)\n", " \n", " # pull out the sister edition to convert from\n", - " sister_ebook = edition.ebooks.filter(format__in=['epub', 'pdf'])[0]\n", + " sister_ebook = edition.ebooks.filter(format__in=['epub', 'html'])[0]\n", " \n", " # run the conversion process\n", - " # output = StringIO(convert_to_mobi(sister_ebook.url))\n", - " output = open(\"/Users/raymondyee/Downloads/hello.mobi\")\n", + " \n", + " print(sister_ebook.url)\n", + "\n", + " #output = convert_to_mobi(sister_ebook.url)\n", + " output = open(\"/Users/raymondyee/Downloads/hello.mobi\").read()\n", + " \n", " file_ = write_file_to_storage(output, \n", " \"application/x-mobipocket-ebook\", \n", - " \"/ebf/hello.mobi\")\n", + " \"/ebf/{0}.mobi\".format(uuid.uuid4().get_hex()))\n", " \n", - " # create a path for the ebookfile: \n", + " # create a path for the ebookfile: IS THIS NECESSARY?\n", " # https://github.com/Gluejar/regluit/blob/25dcb06f464dc11b5e589ab6859dfcc487f8f3ef/core/models.py#L1771\n", " \n", - " ebfile = EbookFile(edition=edition, file=file_, format='mobi')\n", - " ebfile.save()\n", + " #ebfile = EbookFile(edition=edition, file=file_, format='mobi')\n", + " #ebfile.save()\n", "\n", " # maybe need to create an ebook pointing to ebookFile ?\n", " # copy metadata from sister ebook\n", @@ -565,19 +487,48 @@ ], "language": "python", "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "# from django.db.models import Q\n", - "# Item.objects.filter(creator__in=creators)\n", - "edition.ebooks.filter(format__in=['epub', 'pdf'])[0]" + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "(, 140699L)\n", + "https://unglueit-files.s3.amazonaws.com/ebf/830cac2e0b26dfe576e6658623f6243a.epub\n", + "send:" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + " 'HEAD /ebf/d2a010ffe36a12ebba58f3721db814ad.mobi HTTP/1.1\\r\\nHost: ry-dev-unglueit.s3.amazonaws.com\\r\\nAccept-Encoding: identity\\r\\nDate: Wed, 17 Dec 2014 15:57:56 GMT\\r\\nContent-Length: 0\\r\\nAuthorization: AWS AKIAINIMDFN7LAY3WWKA:3Qb/Xh2ukKpn756OFfv42sKuduw=\\r\\nUser-Agent: Boto/2.8.0 (darwin)\\r\\n\\r\\n'\n", + "reply:" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + " 'HTTP/1.1 404 Not Found\\r\\n'\n", + "header: x-amz-request-id: 5C64AFB873D76C27\r\n", + "header: x-amz-id-2: BlGzZrubjkmomUW141X0CnBxSMqNQx4ty1+tX8wof/Kb8l1PRyLWjTn39RaEpYbF5IzFR7tfCGc=\r\n", + "header: Content-Type: application/xml\r\n", + "header: Transfer-Encoding: chunked\r\n", + "header: Date: Wed, 17 Dec 2014 15:57:55 GMT\r\n", + "header: Server: AmazonS3\r\n", + "send: 'PUT /ebf/d2a010ffe36a12ebba58f3721db814ad.mobi HTTP/1.1\\r\\nHost: ry-dev-unglueit.s3.amazonaws.com\\r\\nAccept-Encoding: identity\\r\\nContent-MD5: LLhBofTBmSbRkVZFGkZDww==\\r\\nContent-Length: 8749347\\r\\nExpect: 100-Continue\\r\\nDate: Wed, 17 Dec 2014 15:57:56 GMT\\r\\nUser-Agent: Boto/2.8.0 (darwin)\\r\\nContent-Type: application/x-mobipocket-ebook\\r\\nAuthorization: AWS AKIAINIMDFN7LAY3WWKA:+HaOsbKJd1IWoC8fELOTeuTIRWM=\\r\\nx-amz-acl: public-read\\r\\n\\r\\n'\n", + "https://ry-dev-unglueit.s3.amazonaws.com/ebf/1549a778ca974aea89bc125b83e52dcc.mobi" + ] + }, + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\n" + ] + } ], - "language": "python", - "metadata": {}, - "outputs": [] + "prompt_number": 22 }, { "cell_type": "code", @@ -621,78 +572,6 @@ "language": "python", "metadata": {}, "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "edition.ebooks.all()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "edition.ebook_files.all()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ebf.save()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ebf.id" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ebf.edition.ebooks.all()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "edition_mobi_status(edition)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "\n", - "\n", - "ebf1= EbookFile(edition=e1, file=file_, format='mobi')" - ], - "language": "python", - "metadata": {}, - "outputs": [] } ], "metadata": {}