The notebook is a bit out of date but I want to put it under git control before updating it.
parent
2fac485f08
commit
fbc6c61587
|
@ -0,0 +1,278 @@
|
||||||
|
{
|
||||||
|
"metadata": {
|
||||||
|
"name": "",
|
||||||
|
"signature": "sha256:2b951eb591ac0924fe8694712aba3bc6e6af53bb70dbd53908789042963d28ec"
|
||||||
|
},
|
||||||
|
"nbformat": 3,
|
||||||
|
"nbformat_minor": 0,
|
||||||
|
"worksheets": [
|
||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Source: Directory of Open Access Books (DOAB) about page, metadata section: http://doabooks.org/doab?func=about&uiLanguage=en#metadata\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"collapsed": false,
|
||||||
|
"input": [
|
||||||
|
"import regluit\n",
|
||||||
|
"from regluit.core.bookloader import (add_by_isbn, \n",
|
||||||
|
" get_google_isbn_results)"
|
||||||
|
],
|
||||||
|
"language": "python",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Loading DOAB books looks like it is going to be a bit complicated. I am modelling the approach on how I loaded Gutenberg books: https://github.com/Gluejar/regluit/blob/master/core/bookloader.py#L629\n",
|
||||||
|
"\n",
|
||||||
|
"* add doab id?\n",
|
||||||
|
"\n",
|
||||||
|
"I need to remind myself: how does clustering happen on unglue.it?"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"collapsed": false,
|
||||||
|
"input": [
|
||||||
|
"# sister_edition = add_by_isbn(seed_isbn)"
|
||||||
|
],
|
||||||
|
"language": "python",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"collapsed": false,
|
||||||
|
"input": [
|
||||||
|
"get_google_isbn_results('9781847881298')"
|
||||||
|
],
|
||||||
|
"language": "python",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"collapsed": false,
|
||||||
|
"input": [
|
||||||
|
"import regluit\n",
|
||||||
|
"from regluit.core import (models,tasks)\n",
|
||||||
|
"from regluit.core.bookloader import add_by_isbn\n",
|
||||||
|
"\n",
|
||||||
|
"def load_doab_edition(title, doab_id, seed_isbn, url, format, rights,\n",
"                      provider='Directory of Open Access Books'):\n",
"    \"\"\"Attach the ebook at `url` to the edition registered under `doab_id`.\n",
"\n",
"    The work is looked up through an existing 'doab' Identifier. If there is\n",
"    none, the edition returned by add_by_isbn(seed_isbn) supplies the work and\n",
"    (when doab_id is not None) is registered under the DOAB id. An Ebook whose\n",
"    url already matches is updated in place; otherwise a new one is created.\n",
"\n",
"    Returns the saved models.Ebook (not the Edition).\n",
"    \"\"\"\n",
"    # Imported locally: warnings.warn is used below, but no cell imports the module.\n",
"    import warnings\n",
"\n",
"    try:\n",
"        work = models.Identifier.objects.get(type='doab', value=doab_id).work\n",
"    except models.Identifier.DoesNotExist:\n",
"        # No work carries this DOAB id yet: find an Edition via the seed ISBN\n",
"        # and use its work to hang off of.\n",
"        sister_edition = add_by_isbn(seed_isbn)\n",
"        if sister_edition.new:\n",
"            # add related editions asynchronously\n",
"            tasks.populate_edition.delay(sister_edition.isbn_13)\n",
"        work = sister_edition.work\n",
"        # attach the doab identifier to this work if it's not None\n",
"        if doab_id is not None:\n",
"            models.Identifier.get_or_add(type='doab', value=doab_id,\n",
"                                         work=work,\n",
"                                         edition=sister_edition)\n",
"\n",
"    # Now pull out any existing DOAB edition tied to the work with the proper DOAB ID\n",
"    try:\n",
"        edition = models.Identifier.objects.get(type='doab', value=doab_id).edition\n",
"    except models.Identifier.DoesNotExist:\n",
"        edition = models.Edition()\n",
"        edition.title = title\n",
"        edition.work = work\n",
"        edition.save()\n",
"        models.Identifier.get_or_add(type='doab', value=doab_id,\n",
"                                     edition=edition, work=work)\n",
"\n",
"    # Check whether an Ebook has already been loaded for this url before creating one.\n",
"    ebooks = models.Ebook.objects.filter(url=url)\n",
"    ebook_count = len(ebooks)\n",
"    if ebook_count:\n",
"        ebook = ebooks[0]\n",
"        if ebook_count > 1:\n",
"            warnings.warn(\"There is more than one Ebook matching url {0}\".format(url))\n",
"    else:\n",
"        # need to create new ebook\n",
"        ebook = models.Ebook()\n",
"\n",
"    # format: what's the controlled vocab? -- from Google -- alternative would be mimetype\n",
"    ebook.format = format\n",
"    ebook.provider = provider\n",
"    ebook.url = url\n",
"    ebook.rights = rights\n",
"\n",
"    # is an Ebook instantiable without a corresponding Edition? (No, I think)\n",
"    ebook.edition = edition\n",
"    ebook.save()\n",
"\n",
"    return ebook"
|
||||||
|
],
|
||||||
|
"language": "python",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"collapsed": false,
|
||||||
|
"input": [
|
||||||
|
"# Load a single record by hand before trying the whole list.\n",
"\n",
"book1 = dict(\n",
"    doab_id=u'12555',\n",
"    format='pdf',\n",
"    rights=u'CC BY-NC-ND',\n",
"    seed_isbn=u'9781847881298',\n",
"    title=u'Jihad Beyond Islam',\n",
"    url='http://www.oapen.org/download?type=document&docid=390768',\n",
")\n",
"\n",
"# NOTE: load_doab_edition returns the Ebook it saved, so `edition` holds an Ebook here.\n",
"edition = load_doab_edition(**book1)"
|
||||||
|
],
|
||||||
|
"language": "python",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"collapsed": false,
|
||||||
|
"input": [
|
||||||
|
"df4.urls"
|
||||||
|
],
|
||||||
|
"language": "python",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "heading",
|
||||||
|
"level": 1,
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Loading the list of books"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"collapsed": false,
|
||||||
|
"input": [
|
||||||
|
"import json\n",
|
||||||
|
"# Read the DOAB dump through a context manager so the file handle is closed\n",
"# as soon as the text has been read (the bare open(...).read() never closed it).\n",
"# NOTE: machine-specific absolute path; adjust before running elsewhere.\n",
"with open(\"/Users/raymondyee/D/Document/Working_with_Open_Data/working-open-data-2014/notebooks/doab.json\") as doab_file:\n",
"    s = doab_file.read()\n",
"\n",
"records = json.loads(s)\n",
"records[:5]"
|
||||||
|
],
|
||||||
|
"language": "python",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"collapsed": false,
|
||||||
|
"input": [
|
||||||
|
"from itertools import islice\n",
|
||||||
|
"\n",
|
||||||
|
"# Try the first 30 records, printing each index followed by the outcome.\n",
"# Only PDF entries are loaded; anything else is reported as \"non-pdf\".\n",
"for i, book in enumerate(islice(records, 30)):\n",
"    print i,\n",
"    d = dict(book)\n",
"    if d['format'] != 'pdf':\n",
"        print \"non-pdf\"\n",
"        continue\n",
"    try:\n",
"        # load_doab_edition returns the saved Ebook\n",
"        edition = load_doab_edition(**d)\n",
"        print \"success\"\n",
"    except Exception as e:\n",
"        # best-effort batch run: report the failure and move on to the next record\n",
"        print e\n",
|
||||||
|
" "
|
||||||
|
],
|
||||||
|
"language": "python",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "heading",
|
||||||
|
"level": 1,
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Using regluit.core.doab functions"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"collapsed": false,
|
||||||
|
"input": [
|
||||||
|
"from regluit.core import doab\n"
|
||||||
|
],
|
||||||
|
"language": "python",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"collapsed": false,
|
||||||
|
"input": [
|
||||||
|
"DOAB_DATA_FILE = \"/Users/raymondyee/D/Document/Working_with_Open_Data/working-open-data-2014/notebooks/doab.json\"\n",
|
||||||
|
"doab.load_doab_records(DOAB_DATA_FILE, limit=50, async=True)\n"
|
||||||
|
],
|
||||||
|
"language": "python",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"collapsed": false,
|
||||||
|
"input": [
|
||||||
|
"!tail logs/unglue.it.log"
|
||||||
|
],
|
||||||
|
"language": "python",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"collapsed": false,
|
||||||
|
"input": [
|
||||||
|
"import os"
|
||||||
|
],
|
||||||
|
"language": "python",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"collapsed": false,
|
||||||
|
"input": [
|
||||||
|
"os.path.exists(os.path.join(\"/Users/raymondyee/C/src/Gluejar/regluit/core/management/commands\", \n",
|
||||||
|
" \"../../../bookdata/doab.json\"))"
|
||||||
|
],
|
||||||
|
"language": "python",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"collapsed": false,
|
||||||
|
"input": [],
|
||||||
|
"language": "python",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
Loading…
Reference in New Issue