regluit/notebooks/doab_loading.ipynb

595 lines
20 KiB
Plaintext
Raw Normal View History

{
"metadata": {
"name": "",
"signature": "sha256:c124c7b321845a9c3a2e8dd2fa376f604a42da2230625a4beae73249e4329621"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "heading",
"level": 1,
"metadata": {},
"source": [
"Checking the results of a local celery task"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from regluit.core import tasks\n",
"\n",
"# id of the celery task whose outcome we want to inspect\n",
"task_id = \"c7c6c4a9-c9bf-4881-8800-ecea8e365655\"\n",
"\n",
"# build a handle on that task via the `fac` task object\n",
"result = tasks.fac.AsyncResult(task_id)\n",
"\n",
"# the value of get() is what the cell displays\n",
"result.get()"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"http://doabooks.org/doab?func=about&uiLanguage=en#metadata\n"
]
},
{
"cell_type": "heading",
"level": 1,
"metadata": {},
"source": [
"Loading the list of books"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import json\n",
"import codecs\n",
"\n",
"# read the DOAB dump inside a `with` block so the file handle is always closed\n",
"with codecs.open(\"../bookdata/doab.json\", encoding='UTF-8') as doab_file:\n",
"    s = doab_file.read()\n",
"\n",
"records = json.loads(s)\n",
"\n",
"# show just the first record as a sample\n",
"records[:1]"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# how many PDFs are there to load?\n",
"\n",
"def is_pdf(record):\n",
"    \"\"\"Return True when the record's 'format' entry equals 'pdf'.\"\"\"\n",
"    return dict(record).get('format') == 'pdf'\n",
"\n",
"pdf_records = [record for record in records if is_pdf(record)]\n",
"len(pdf_records)"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from collections import Counter\n",
"\n",
"# doab_ids unique for the PDFs?\n",
"# count how often each doab_id occurs, then keep the (doab_id, count) pairs seen more than once\n",
"doab_id_counts = Counter([dict(r).get('doab_id') for r in pdf_records])\n",
"[pair for pair in doab_id_counts.items() if pair[1] > 1]\n",
"\n",
"# 2 of the doab records have more than 1 pdf\n",
"# doab_id of 15968 and 15969 "
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# new algorithm\n",
"\n",
"    for each doab_id:\n",
"        for each isbn,\n",
"            try to find a google_id for the isbn (by add_by_isbn), otherwise create our own edition\n",
"            (asynchronously), populate related isbns for each of the isbns\n",
"        for each of the works associated with this list of editions:\n",
"            we can manually run merge_work on all of them pairwise\n",
"\n",
"    then we should end up w/ one work per doab_id --> tie that doab_id to the super-merged work."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def load_doab_edition(title, doab_id, seed_isbn, url, format, rights,\n",
"                      language, isbns,\n",
"                      provider='Directory of Open Access Books', **kwargs):\n",
"    \"\"\"\n",
"    Load one DOAB record: make sure an Ebook exists for `url` and that the Work\n",
"    it belongs to carries a 'doab' Identifier whose value is `doab_id`.\n",
"\n",
"    Returns the existing Ebook when exactly one already has this url, a newly\n",
"    saved Ebook otherwise, or None when `isbns` is empty (nothing is created).\n",
"    Raises Exception when more than one Ebook already has this url.\n",
"\n",
"    `seed_isbn` and any extra keyword arguments are accepted so that a whole\n",
"    record can be passed with ** but they are not used in this function.\n",
"    \"\"\"\n",
"    from django.db.models import Q\n",
"\n",
"    from regluit.core import tasks\n",
"    from regluit.core import (models, bookloader)\n",
"\n",
"    # check to see whether the Ebook hasn't already been loaded first\n",
"    # search by url\n",
"    ebooks = models.Ebook.objects.filter(url=url)\n",
"\n",
"    # three cases: > 1 match, 1 match, 0 match\n",
"    # TODO: put a migration to force Ebook.url to be unique id\n",
"\n",
"    if len(ebooks) > 1:\n",
"        raise Exception(\"There is more than one Ebook matching url {0}\".format(url))\n",
"    elif len(ebooks) == 1:\n",
"        # already loaded: only make sure the doab id is tied to the ebook's work\n",
"        ebook = ebooks[0]\n",
"        models.Identifier.get_or_add(type='doab', value=doab_id,\n",
"                                     work=ebook.edition.work)\n",
"        return ebook\n",
"\n",
"    # remaining case --> need to create a new Ebook\n",
"    assert len(ebooks) == 0\n",
"\n",
"    # make sure we have isbns to work with before creating ebook\n",
"    if len(isbns) == 0:\n",
"        return None\n",
"\n",
"    ebook = models.Ebook()\n",
"    ebook.format = format\n",
"    ebook.provider = provider\n",
"    ebook.url = url\n",
"    ebook.rights = rights\n",
"\n",
"    # we still need to find the right Edition/Work to tie Ebook to...\n",
"\n",
"    # look for the Edition with which to associate ebook.\n",
"    # loop through the isbns and stop at the first one that yields an edition\n",
"    edition = None\n",
"    for isbn in isbns:\n",
"        edition = bookloader.add_by_isbn(isbn)\n",
"        if edition is not None:\n",
"            break\n",
"\n",
"    if edition is not None:\n",
"        # if this is a new edition, then add related editions asynchronously\n",
"        if getattr(edition, 'new', False):\n",
"            tasks.populate_edition.delay(edition.isbn_13)\n",
"\n",
"        # QUESTION: Is this good enough?\n",
"        # what's going to happen to edition.work if there's merging\n",
"        models.Identifier.get_or_add(type='doab', value=doab_id,\n",
"                                     work=edition.work)\n",
"\n",
"    # we need to create Edition(s) de novo\n",
"    else:\n",
"        # if there is a Work with doab_id already, attach any new Edition(s)\n",
"        try:\n",
"            work = models.Identifier.objects.get(type='doab', value=doab_id).work\n",
"        except models.Identifier.DoesNotExist:\n",
"            work = models.Work(language=language, title=title)\n",
"            work.save()\n",
"            models.Identifier.get_or_add(type='doab', value=doab_id,\n",
"                                         work=work)\n",
"\n",
"        # create Edition(s) for each of the isbn from the input info\n",
"        editions = []\n",
"        for isbn in isbns:\n",
"            edition = models.Edition(title=title, work=work)\n",
"            edition.save()\n",
"\n",
"            # NOTE(review): the isbn identifier is attached to the work only --\n",
"            # confirm whether it should also reference the new edition\n",
"            models.Identifier.get_or_add(type='isbn', value=isbn, work=work)\n",
"\n",
"            editions.append(edition)\n",
"\n",
"        # if work has any ebooks already, attach the ebook to the corresponding edition\n",
"        # otherwise pick the first of the editions just created\n",
"        editions_with_ebooks = models.Edition.objects.filter(\n",
"            Q(work__id=work.id) & Q(ebooks__isnull=False)).distinct()\n",
"        if editions_with_ebooks:\n",
"            edition = editions_with_ebooks[0]\n",
"        else:\n",
"            edition = editions[0]\n",
"\n",
"    # tie the edition to ebook\n",
"    ebook.edition = edition\n",
"    ebook.save()\n",
"\n",
"    return ebook"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# load 1 record\n",
"# index into pdf_records (defined above) rather than records_to_load, which is\n",
"# only built by the bulk-loading cell below and holds the same list\n",
"book = pdf_records[33]\n",
"\n",
"ebook = load_doab_edition(**dict(book))\n",
"ebook.id, ebook.edition.id, ebook.edition.work.id"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from itertools import islice\n",
"\n",
"import json\n",
"import codecs\n",
"\n",
"# (record dict, exception) for every record whose load raised\n",
"loading_problems = []\n",
"\n",
"# read the dump inside a `with` block so the file handle is always closed\n",
"with codecs.open(\"../bookdata/doab.json\", encoding='UTF-8') as doab_file:\n",
"    s = doab_file.read()\n",
"records = json.loads(s)\n",
"\n",
"# filter out the pdf records\n",
"\n",
"pdf_records = [record for record in records if dict(record).get('format') == 'pdf']\n",
"# islice(..., None) keeps everything; replace None with a number to load a subset\n",
"records_to_load = list(islice(pdf_records, None))\n",
"\n",
"for (i, book) in enumerate(records_to_load):\n",
"    print i,\n",
"    d = dict(book)\n",
"    try:\n",
"        loaded_ebook = load_doab_edition(**d)\n",
"        # load_doab_edition returns None when the record has no isbns\n",
"        if loaded_ebook is None:\n",
"            print \"skipped\"\n",
"        else:\n",
"            print \"success\"\n",
"    except Exception as e:\n",
"        # keep going, but remember which record failed and why\n",
"        loading_problems.append((d, e))\n",
"        print e\n",
"    "
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"I need to remind myself of how to check that there are no outstanding celery jobs after I do this loading. \n",
"\n",
"I have a technique for using `django-celery` monitoring that works on redis (what we use on just and production) -- but not laptop (http://stackoverflow.com/a/5451479/7782). I think a workable way is to look at the celery_taskmeta table."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import djcelery\n",
"\n",
"# status of every row in the celery task-meta table\n",
"task_rows = djcelery.models.TaskMeta.objects.all()\n",
"[task_row.status for task_row in task_rows]"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Tests for the loading\n",
"\n",
" * can we find all the URLs?\n",
" * is it associated with the right doab_id?\n",
" * all the ISBNs loaded?\n",
" * which books are not matched with Google Books IDs -- and therefore might require URLs for covers?\n",
" * did I make sure the edition I'm attaching the ebooks to is the \"selected edition\"?\n",
" * for editions that I create, attach a cover_image from DOAB.\n",
" * all clustered around the same work? (or do I need to do further merging?)\n",
" * are we creating extraneous works?\n",
" * are we loading all the useful metadata? (subject metadata?)\n",
" * is the loading script idempotent?\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## important limit to testing\n",
"\n",
"I have written code to handle the loading of all associated ISBNs with DOAB records -- but because we upload only records with non-null licenses, we will have only one ISBN per DOAB record for records with known licenses. So the loading of works for which we know the license won't exercise the code in question:\n",
"https://github.com/Gluejar/regluit/blob/5b3a8d7b1302bc1b1985c675add06c345567a7a1/core/doab.py#L91\n",
"I also checked that there is no intersection of DOAB ids between records with known licenses and those without."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from regluit.core.models import Identifier\n",
"\n",
"# look the work up explicitly so this cell does not rely on a `work` variable\n",
"# left in the kernel by a cell further down\n",
"# NOTE(review): doab id 12592 is the one used in the cover-image section -- confirm it is the intended work\n",
"work = Identifier.objects.get(type='doab', value='12592').work\n",
"work.cover_image_small()"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from itertools import islice\n",
"\n",
"from regluit.core.models import Work, Edition, Ebook, Identifier\n",
"from regluit.core.isbn import ISBN\n",
"\n",
"# (record, exception) for records whose checks raised\n",
"tests_exceptions = []\n",
"# records whose ebook's edition has no 'goog' identifier\n",
"no_google_book_id = []\n",
"# (record, [problem descriptions]) for records failing at least one check\n",
"all_problems = []\n",
"\n",
"\n",
"for record in islice(records_to_load, None):\n",
"    d = dict(record)\n",
"    ebooks = Ebook.objects.filter(url=d.get('url'))\n",
"\n",
"    problems = []\n",
"\n",
"    try:\n",
"        # check only one ebook with this URL.\n",
"        if len(ebooks) != 1:\n",
"            # format() instead of str + int, which raises TypeError\n",
"            problems.append(\"len(ebooks): {0}\".format(len(ebooks)))\n",
"\n",
"        if len(ebooks) == 0:\n",
"            # nothing else can be checked without an ebook; record and move on\n",
"            all_problems.append((d, problems))\n",
"            continue\n",
"\n",
"        first_ebook = ebooks[0]\n",
"\n",
"        # ebook.edition.work is the work with the doab_id\n",
"        doab_work = Identifier.objects.get(type='doab',\n",
"                                           value=d.get('doab_id')).work\n",
"        if not (first_ebook.edition.work == doab_work):\n",
"            problems.append(\"ebook.edition.work is the work with the doab_id\")\n",
"\n",
"        # all the ISBNs loaded?\n",
"        # this code might be a bit inefficient given there might only be one isbn per record\n",
"        isbns = [ISBN(i).to_string() for i in d.get('isbns')]\n",
"        loaded_isbns = set([id_.value for id_ in Identifier.objects.filter(type=\"isbn\",\n",
"                                                                             value__in=isbns)])\n",
"        if not (set(isbns) == loaded_isbns):\n",
"            problems.append(\"isbns not matching\")\n",
"\n",
"        if problems:\n",
"            all_problems.append((d, problems))\n",
"\n",
"        # check on presence of Google books id\n",
"        if len(first_ebook.edition.identifiers.filter(type=\"goog\")) < 1:\n",
"            no_google_book_id.append(d)\n",
"\n",
"    except Exception as e:\n",
"        tests_exceptions.append((d, e))\n",
"\n",
"print \"all_problems\", all_problems\n",
"print\n",
"print \"tests_exceptions\", tests_exceptions\n",
"print\n",
"print \"no_google_book_id\", no_google_book_id"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# invalid ISBNs?\n",
"\n",
"# for every record with problems, print its first isbn next to ISBN(...).valid\n",
"for (problem_record, problem_list) in all_problems:\n",
"    first_isbn = problem_record['isbns'][0]\n",
"    print first_isbn, ISBN(first_isbn).valid"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# (doab_id, first isbn) for every record collected in no_google_book_id\n",
"[(rec['doab_id'], rec['isbns'][0]) for rec in no_google_book_id]"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# it is possible to do a query for a whole set of values, a technique I might make use of.\n",
"# http://stackoverflow.com/a/9304968\n",
"# e.g., Blog.objects.filter(pk__in=[1,4,7])\n",
"\n",
"urls = [dict(record).get('url') for record in records_to_load]\n",
"\n",
"# urls of the Ebooks found by one __in query, compared with the urls we asked for\n",
"matching_ebooks = Ebook.objects.filter(url__in=urls)\n",
"loaded_urls = set(eb.url for eb in matching_ebooks)\n",
"loaded_urls == set(urls)"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "heading",
"level": 1,
"metadata": {},
"source": [
"Custom Cover Images"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Set the edition.cover_image? \n",
"\n",
"How to set selected edition? Just set it on the work?: https://github.com/Gluejar/regluit/blob/b675052736f79dcb8d84ddc6349c99fa392fa9bc/core/models.py#L991"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"work = Identifier.objects.get(type='doab', value='12592').work\n",
"edition = work.preferred_edition\n",
"\n",
"# cover_image_small is a method (other cells call it); call it so the cell\n",
"# displays its return value rather than the bound method object\n",
"work.cover_image_small()\n"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# the code in this cell will write links to doabooks.org\n",
"# http://www.doabooks.org/doab?func=cover&rid=12592\n",
"\n",
"def update_cover_doab(doab_id):\n",
"    \"\"\"\n",
"    Set cover_image on the preferred edition of the work identified by\n",
"    `doab_id` to the doabooks.org cover link for that id, save the edition,\n",
"    and return the work.\n",
"    \"\"\"\n",
"    cover_url = \"http://www.doabooks.org/doab?func=cover&rid={0}\".format(doab_id)\n",
"\n",
"    doab_work = Identifier.objects.get(type='doab', value=doab_id).work\n",
"    preferred = doab_work.preferred_edition\n",
"    preferred.cover_image = cover_url\n",
"    preferred.save()\n",
"\n",
"    return doab_work"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# NEXT STEP\n",
"# Eric is advocating another approach\n",
"# https://github.com/Gluejar/regluit/pull/363#issuecomment-48373487\n",
"# \"I would retrieve the doab cover and re-serve it.\"\n",
"\n",
"# based on \n",
"# https://github.com/Gluejar/regluit/blob/420cbdf448ef744ed2a2ab45f279f6bdddc14ca8/frontend/views.py#L609\n",
"\n",
"from django.core.files.storage import default_storage\n",
"from StringIO import StringIO\n",
"import requests\n",
"\n",
"def update_cover_doab_2(doab_id, uploader=\"RaymondYee\"):\n",
"    \"\"\"\n",
"    Download the DOAB cover for `doab_id`, write it to default_storage, point\n",
"    the preferred edition's cover_image at the stored copy, save the edition,\n",
"    and return it.\n",
"\n",
"    `uploader` fills the user segment of the storage name; its default keeps\n",
"    the name that was previously hard-coded.\n",
"    Raises requests.HTTPError when the cover request returns an error status,\n",
"    so an error page is never stored as a cover.\n",
"    \"\"\"\n",
"    work = Identifier.objects.get(type='doab', value=doab_id).work\n",
"    edition = work.preferred_edition\n",
"\n",
"    url = \"http://www.doabooks.org/doab?func=cover&rid={0}\".format(doab_id)\n",
"    r = requests.get(url)\n",
"    r.raise_for_status()\n",
"\n",
"    cover_file_name = '/Users/%s/covers/%s/doab_%s' % (uploader, edition.pk, doab_id)\n",
"    cover_out = default_storage.open(cover_file_name, 'w')\n",
"    try:\n",
"        cover_out.write(r.content)\n",
"    finally:\n",
"        # close the storage file even when the write fails\n",
"        cover_out.close()\n",
"\n",
"    # and put its url into cover_image\n",
"    edition.cover_image = default_storage.url(cover_file_name)\n",
"    edition.save()\n",
"    return edition\n"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# give every record without a Google Books id a DOAB cover link and show the result\n",
"for d in no_google_book_id:\n",
"    record_doab_id = d['doab_id']\n",
"    work = update_cover_doab(record_doab_id)\n",
"    print work.id, work.cover_image_small()"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "heading",
"level": 1,
"metadata": {},
"source": [
"Adding Subjects"
]
},
{
"cell_type": "heading",
"level": 1,
"metadata": {},
"source": [
"Code I was working out to use Django querysets to pull out relationships among ebooks, editions, works"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from regluit.core.models import (Ebook, Edition, Work)\n",
"from django.db.models import (Q, F)\n",
"\n",
"# models.Identifier.objects.filter(edition__isnull=False).filter(~Q(edition__work__id = F('work__id'))).count()\n",
"\n",
"# editions that have at least one ebook attached\n",
"editions_with_ebooks = Edition.objects.filter(ebooks__isnull=False)\n",
"\n",
"# take the first such edition and look at its work and that work's editions\n",
"edition = editions_with_ebooks[0]\n",
"print edition.work_id\n",
"work = edition.work\n",
"print work.editions.all()\n",
"\n",
"# didn't know you should use distinct()\n",
"same_work = Q(work__id=edition.work_id)\n",
"has_ebook = Q(ebooks__isnull=False)\n",
"Edition.objects.filter(same_work & has_ebook).distinct()\n",
"#Edition.objects.filter(Q(work__id=edition.work_id))\n",
"#work.objects.filter(editions__ebooks__isnull=False)"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# let me grab ebooks and look at their parent works\n",
"\n",
"from regluit.core.models import Ebook\n",
"\n",
"# the edition each Ebook is attached to, in queryset order\n",
"all_ebooks = Ebook.objects.all()\n",
"[each_ebook.edition for each_ebook in all_ebooks]"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "heading",
"level": 1,
"metadata": {},
"source": [
"Using regluit.core.doab functions"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from regluit.core import doab\n"
],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}