Merge branch 'production'

pull/1/head
Raymond Yee 2012-02-24 14:56:45 -08:00
commit 538d225ad0
5 changed files with 345 additions and 64 deletions

View File

@@ -636,6 +636,7 @@ def add_missing_isbn_to_editions(max_num=None, confirm=False):
        'no_isbn_found': no_isbn_found,
        'editions_to_merge': editions_to_merge,
        'exceptions': exceptions,
        'google_id_not_found': google_id_not_found,
        'confirm': ok
    }

View File

@@ -0,0 +1,34 @@
"""
Dispose of the Frankenworks and recluster the works. Print out email addresses of those whose wishlists have been
affected.
"""
from django.core.management.base import BaseCommand
from regluit.test import booktests
class Command(BaseCommand):
    help = "Dispose of the Frankenworks and recluster the works. Print out email addresses of those whose wishlists have been affected."
    args = "<do>"

    def handle(self, do, **options):
        try:
            do = str(do)
            if do.lower() == 'true':
                do = True
            else:
                do = False
        except:
            do = False

        print "before..."
        s = booktests.cluster_status()
        print s['results']

        booktests.clean_frankenworks(s, do=do)

        s = booktests.cluster_status()
        print "after cleanup...."
        print "results ", s['results']
        print "scattered clusters ", s['scattered_clusters']
        print "franken works", s['franken_works']
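A minimal invocation sketch: the management command's name comes from the new file's name, which this diff view does not show, so 'clean_frankenworks' below is an assumption; the single argument must be the string 'true' to actually delete and recluster, anything else gives a dry run.

from django.core.management import call_command

# hypothetical command name -- the real name is whatever the new management command file is called
call_command('clean_frankenworks', 'false')   # dry run: report affected works and email addresses only
call_command('clean_frankenworks', 'true')    # delete the Frankenworks and their comments, then recluster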

View File

@@ -11,6 +11,7 @@ import random
random.seed()
import sys, os
import json
# a kludge to allow for isbn.py to be imported
# and not just in the context of the regluit Django app
@@ -436,15 +437,14 @@ class FreebaseBooks(object):
self.freebase.login(username,password)
def books(self):
MQL = u"""[{
"type": "/book/book",
"id": null,
"key": [{
"namespace": "/wikipedia/en",
"value": null,
"type": "/type/key"
}]
}]
""".replace("\n"," ")
"type": "/book/book",
"id": null,
"key": [{
"namespace": "/wikipedia/en",
"value": null,
"type": "/type/key"
}]
}]""".replace("\n"," ")
query = json.loads(MQL)
resp = self.freebase.mqlreaditer(query)
for r in resp:
@@ -452,18 +452,17 @@ class FreebaseBooks(object):
def book_editions(self):
MQL = u"""[{
"type": "/book/book_edition",
"id": null,
"isbn": [{}],
"ISBN": [{}],
"LCCN": [{}],
"OCLC_number": [{}],
"openlibrary_id": [{}],
"book": {
"id": null,
"name": null
}
}]""".replace("\n"," ")
"type": "/book/book_edition",
"id": null,
"isbn": [{}],
"ISBN": [{}],
"LCCN": [{}],
"OCLC_number": [{}],
"openlibrary_id": [{}],
"book": {
"id": null,
"name": null
}}]""".replace("\n"," ")
query = json.loads(MQL)
resp = self.freebase.mqlreaditer(query)
for r in resp:
@@ -471,18 +470,17 @@ class FreebaseBooks(object):
def editions_for_book(self, book_id):
MQL = u"""[{
"type": "/book/book_edition",
"id": null,
"isbn": [{}],
"ISBN": [{}],
"LCCN": [{}],
"OCLC_number": [{}],
"openlibrary_id": [{}],
"book": {
"id": null,
"name": null
}
}]""".replace("\n"," ")
"type": "/book/book_edition",
"id": null,
"isbn": [{}],
"ISBN": [{}],
"LCCN": [{}],
"OCLC_number": [{}],
"openlibrary_id": [{}],
"book": {
"id": null,
"name": null
}}]""".replace("\n"," ")
query = json.loads(MQL)
query[0]["book"]["id"] = book_id
resp = self.freebase.mqlreaditer(query)
@@ -491,18 +489,17 @@ class FreebaseBooks(object):
def book_edition_by_id(self,id,id_type):
MQL = u"""[{
"type": "/book/book_edition",
"id": null,
"isbn": [{}],
"ISBN": [{}],
"LCCN": [{}],
"OCLC_number": [{}],
"openlibrary_id": [{}],
"book": {
"id": null,
"name": null
}
}]""".replace("\n"," ")
"type": "/book/book_edition",
"id": null,
"isbn": [{}],
"ISBN": [{}],
"LCCN": [{}],
"OCLC_number": [{}],
"openlibrary_id": [{}],
"book": {
"id": null,
"name": null
}}]""".replace("\n"," ")
query = json.loads(MQL)
if id_type == 'isbn':
query[0][id_type][0].setdefault('name', id)
@@ -526,18 +523,18 @@ class FreebaseBooks(object):
elif isbn_val is not None:
isbn_val = isbn_mod.ISBN(isbn_val).to_string('13')
MQL = """[{
"type": "/book/book_edition",
"isbn": {
"name": null
},
"book": {
"editions": [{
"isbn": {
"name": null
}
}]
}
}]""".replace("\n"," ")
"type": "/book/book_edition",
"isbn": {
"name": null
},
"book": {
"editions": [{
"isbn": {
"name": null
}
}]
}
}]""".replace("\n"," ")
query = json.loads(MQL)
query[0]["book"]["editions"][0]["isbn"]["name"] = isbn_val
resp = self.freebase.mqlreaditer(query)
@@ -565,7 +562,91 @@ class WorkMapper(object):
yield work_id
if not complete_search:
raise StopIteration()
class LibraryThing(object):
    """
    Provide cached access to thingisbn and LT whatwork interface. Allow for a cache file to be loaded and saved
    """
    def __init__(self, fname=None):
        self.__isbn_to_work_id = {}
        self.fname = fname

    def __del__(self):
        self.save()

    def thingisbn(self, isbn, return_work_id=False):
        """ if return_work_id is True, we won't try to calculate all the relevant isbns"""
        # first, normalize the isbn
        isbn = isbn_mod.ISBN(isbn).to_string('13')
        if isbn is None: return []
        # check to see whether we have isbn already
        if isbn in self.__isbn_to_work_id:
            # return all isbns with the work id
            # print "%s already cached" % (isbn)
            work_id = self.__isbn_to_work_id.get(isbn)
            if return_work_id:
                return work_id
            if work_id is not None:
                return [k for (k, v) in self.__isbn_to_work_id.items() if v == work_id]
            else:
                return []
        else:
            # if isbn is not already cached, do look up and cache the results and return the results
            print "calling thingisbn for %s" % (isbn)
            results = [isbn_mod.ISBN(k).to_string('13') for k in thingisbn (isbn)]
            if len(results):
                # look up the librarything work id
                work_id = self.whatwork(isbn)
                if work_id is not None: # which should be the case since results is not zero-length
                    self.__isbn_to_work_id.update(dict([(isbn_mod.ISBN(result).to_string('13'), work_id) for result in results]))
                else:
                    logger.exception("work_id should not be None for isbn %s", isbn)
                    return []
            else:
                self.__isbn_to_work_id[isbn] = None # mark as not recognized by LT
                work_id = None
            if return_work_id:
                return work_id
            else:
                return results

    def whatwork(self, isbn=None, title=None, author=None):
        # if isbn is not None and title, author None then look up results, otherwise just pass along to lt_whatwork
        # first, normalize the isbn
        isbn = isbn_mod.ISBN(isbn).to_string('13')
        if isbn is not None and (title is None and author is None):
            if isbn in self.__isbn_to_work_id:
                work_id = self.__isbn_to_work_id.get(isbn)
            else:
                work_id = lt_whatwork(isbn=isbn)
                self.__isbn_to_work_id[isbn] = work_id
            return work_id
        else:
            return lt_whatwork(isbn=isbn, title=title, author=author)

    def load(self):
        try:
            f = open(self.fname)
            input_data = json.load(f)
            f.close()
            if isinstance(input_data, dict):
                self.__isbn_to_work_id = input_data
                return True
            else:
                return False
        except Exception, e:
            print e

    def save(self):
        if self.fname is not None:
            f = open(self.fname, "w")
            json.dump(self.__isbn_to_work_id, f)
            f.close()
            return True
        else:
            return False
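A brief usage sketch of the caching wrapper above (the cache path is an arbitrary example, and SURFACING_ISBN is the module constant exercised by the tests later in this file):

lt = LibraryThing(fname='/tmp/lt_cache.json')    # hypothetical cache location
lt.load()                                        # prints the error and carries on if no cache file exists yet
sibling_isbns = lt.thingisbn(SURFACING_ISBN)     # first call queries LT; repeat calls hit the cache
lt_work_id = lt.thingisbn(SURFACING_ISBN, return_work_id=True)
lt.save()                                        # persist the isbn -> work_id map to fname as JSON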
def look_up_my_zotero_books_in_hathi():
    from regluit.experimental.zotero_books import MyZotero
@@ -786,6 +867,17 @@ class LibraryThingTest(TestCase):
        self.assertEqual(work_id, SURFACING_LT_WORK_ID)
        work_id = lt_whatwork(title='Hamlet', author='Shakespeare')
        self.assertEqual(work_id, '2199')

    def test_cache(self):
        lt = LibraryThing()
        res = lt.thingisbn(SURFACING_ISBN)
        res2 = lt.thingisbn(SURFACING_ISBN)
        self.assertEqual(set(res), set(res2))
        self.assertEqual(lt.whatwork(SURFACING_ISBN), SURFACING_LT_WORK_ID)
        self.assertEqual(lt.thingisbn(SURFACING_ISBN, return_work_id=True), SURFACING_LT_WORK_ID)
def suite():
@@ -793,7 +885,7 @@ def suite():
    #testcases = [WorkMapperTest,FreebaseBooksTest, OpenLibraryTest,GoogleBooksTest]
    testcases = []
    suites = unittest.TestSuite([unittest.TestLoader().loadTestsFromTestCase(testcase) for testcase in testcases])
    suites.addTest(LibraryThingTest('test_whatwork'))
    suites.addTest(LibraryThingTest('test_cache'))
    #suites.addTest(SettingsTest('test_dev_me_alignment')) # give option to test this alignment
    return suites
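For reference, the trimmed-down suite can presumably be run with the stock unittest runner:

import unittest
unittest.TextTestRunner(verbosity=2).run(suite())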

Binary file not shown.

View File

@@ -1,12 +1,15 @@
from regluit.core import librarything, bookloader, models, tasks
from collections import OrderedDict
from itertools import izip, islice
from collections import OrderedDict, defaultdict, namedtuple
from itertools import izip, islice, repeat
import django
from django.db.models import Q, F
from regluit.core import bookloader
from django.contrib.comments.models import Comment
import warnings
import datetime
from regluit import experimental
from regluit.experimental import bookdata
from datetime import datetime
import json
@@ -14,6 +17,20 @@ import json
import logging
logger = logging.getLogger(__name__)
def dictset(itertuple):
    s = defaultdict(set)
    for (k, v) in itertuple:
        s[k].add(v)
    return s

def dictlist(itertuple):
    d = defaultdict(list)
    for (k, v) in itertuple:
        d[k].append(v)
    return d

EdInfo = namedtuple('EdInfo', ['isbn', 'ed_id', 'ed_title', 'ed_created', 'work_id', 'work_created', 'lang'])
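For concreteness, a tiny worked example of the two helpers above (values chosen arbitrarily):

pairs = [('a', 1), ('a', 1), ('b', 2)]
dictset(pairs)    # defaultdict of sets, roughly {'a': set([1]), 'b': set([2])}
dictlist(pairs)   # defaultdict of lists, roughly {'a': [1, 1], 'b': [2]}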
def ry_lt_books():
    """return parsing of rdhyee's LibraryThing collection"""
    lt = librarything.LibraryThing('rdhyee')
@@ -85,26 +102,163 @@ def load_gutenberg_books(fname="/Users/raymondyee/D/Document/Gluejar/Gluejar.git
else:
logger.info("%d null seed_isbn: ebook %s", i, ebook)
def cluster_status():
def cluster_status(max_num=None):
    """Look at the current Work, Edition instances to figure out what needs to be fixed"""
    results = OrderedDict([
        ('number of Works', models.Work.objects.count()),
        ('number of Editions', models.Edition.objects.count())
        ('number of Works w/o Identifier', models.Work.objects.filter(identifiers__isnull=True).count()),
        ('number of Editions', models.Edition.objects.count()),
        ('number of Editions with ISBN', models.Edition.objects.filter(identifiers__type='isbn').count()),
        ('number of Editions without ISBNs', models.Edition.objects.exclude(identifiers__type='isbn').count()),
        ('number of Edition that have both Google Books id and ISBNs',
            models.Edition.objects.filter(identifiers__type='isbn').filter(identifiers__type='goog').count()),
        ('number of Editions with Google Books IDs but not ISBNs',
            models.Edition.objects.filter(identifiers__type='goog').exclude(identifiers__type='isbn').count()),
    ])

    # What needs to be done to recluster editions?
    # models.Identifier.objects.filter(type='isbn').values_list('value', 'edition__id', 'edition__work__id', 'edition__work__language').count()
    # 4 classes -- Edition have ISBN or not & ISBN is recognized or not by LT
    # a) ISBN recognized by LT, b) ISBN not recognized by LT, c) no ISBN at all
    # [w._meta.get_all_related_objects() for w in works_no_ids] -- try to figure out whether any related objects before deleting

    # Are there Edition without ISBNs? Look up the corresponding ISBNs from Google Books and Are they all singletons?
    # identify Editions that should be merged (e.g., if one Edition has a Google Books ID and another Edition has one with
    # an ISBN tied to that Google Books ID)

    import shutil
    import time
    import operator

    return results

    # let's form a key to map all the Editions into
    # (lt_work_id (or None), lang, ISBN (if lt_work_id is None or None if we don't know it), ed_id (or None) )

    work_clusters = defaultdict(set)
    current_map = defaultdict(set)

    #backup = '/Users/raymondyee/D/Document/Gluejar/Gluejar.github/regluit/experimental/lt_data_back.json'
    backup = '{0}/lt_data_back.json'.format(experimental.__path__[0])
    #fname = '/Users/raymondyee/D/Document/Gluejar/Gluejar.github/regluit/experimental/lt_data.json'
    fname = '{0}/lt_data.json'.format(experimental.__path__[0])

    shutil.copy(fname, backup)

    lt = bookdata.LibraryThing(fname)
    try:
        input_file = open(fname, "r")
        success = lt.load()
        print "success: %s" % (success)
        input_file.close()
    except Exception, e:
        print e

    for (i, (isbn, ed_id, ed_title, ed_created, work_id, work_created, lang)) in enumerate(
            islice(models.Identifier.objects.filter(type='isbn').values_list('value', 'edition__id',
                'edition__title', 'edition__created', 'edition__work__id',
                'edition__work__created', 'edition__work__language'), max_num)):
        lt_work_id = lt.thingisbn(isbn, return_work_id=True)
        key = (lt_work_id, lang, isbn if lt_work_id is None else None, None)
        print i, isbn, lt_work_id, key
        work_clusters[key].add(EdInfo(isbn=isbn, ed_id=ed_id, ed_title=ed_title, ed_created=ed_created,
            work_id=work_id, work_created=work_created, lang=lang))
        current_map[work_id].add(key)

    lt.save()

    # Now add the Editions without any ISBNs
    print "editions w/o isbn"
    for (i, (ed_id, ed_title, ed_created, work_id, work_created, lang)) in enumerate(
            islice(models.Edition.objects.exclude(identifiers__type='isbn').values_list('id',
                'title', 'created', 'work__id', 'work__created', 'work__language' ), None)):
        key = (None, lang, None, ed_id)
        print i, ed_id, ed_title.encode('ascii','ignore'), key
        work_clusters[key].add(EdInfo(isbn=None, ed_id=ed_id, ed_title=ed_title, ed_created=ed_created,
            work_id=work_id, work_created=work_created, lang=lang))
        current_map[work_id].add(key)

    print "number of clusters", len(work_clusters)

    # all unglue.it Works that contain Editions belonging to more than one newly calculated cluster are "FrankenWorks"
    franken_works = sorted([k for (k,v) in current_map.items() if len(v) > 1])

    # let's calculate the list of users affected if delete the Frankenworks, the number of works deleted from their wishlist
    # specifically a list of emails to send out
    affected_works = [models.Work.objects.get(id=w_id) for w_id in franken_works]
    affected_wishlists = set(reduce(operator.add, [list(w.wishlists.all()) for w in affected_works])) if len(affected_works) else set()

    affected_emails = [w.user.email for w in affected_wishlists]
    affected_editions = reduce(operator.add, [list(w.editions.all()) for w in affected_works]) if len(affected_works) else []

    # calculate the Comments that would have to be deleted too.
    affected_comments = reduce(operator.add, [list(Comment.objects.for_model(w)) for w in affected_works]) if len(affected_works) else []

    # calculate the inverse of work_clusters
    wcp = dict(reduce(operator.add, [ list( izip([ed.ed_id for ed in eds], repeat(k))) for (k,eds) in work_clusters.items()]))

    # (I'm not completely sure of this calc -- but the datetime of the latest franken-event)
    latest_franken_event = max([ max([min(map(lambda x: x[1], v)) for v in dictlist([(wcp[ed["id"]], (ed["id"], ed["created"].isoformat()))
            for ed in models.Work.objects.get(id=w_id).editions.values('id', 'created')]).values()])
        for w_id in franken_works]) if len(franken_works) else None

    scattered_clusters = [(k, len(set(([e.work_id for e in v])))) for (k,v) in work_clusters.items() if len(set(([e.work_id for e in v]))) <> 1 ]

    s = {'work_clusters':work_clusters, 'current_map':current_map, 'results':results, 'franken_works': franken_works,
         'wcp':wcp, 'latest_franken_event': latest_franken_event, 'affected_works':affected_works,
         'affected_comments': affected_comments, 'scattered_clusters': scattered_clusters,
         'affected_emails': affected_emails}

    return s
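The core of the recomputation above is the grouping key (lt_work_id, lang, isbn when LT does not know the work, ed_id for ISBN-less editions); a self-contained toy illustration of that keying scheme with made-up IDs and ISBNs, not the production code path:

from collections import defaultdict, namedtuple

Ed = namedtuple('Ed', ['isbn', 'ed_id', 'work_id', 'lang', 'lt_work_id'])
editions = [Ed('9780000000001', 1, 10, 'en', 'lt1'),
            Ed('9780000000002', 2, 10, 'en', 'lt2'),   # same unglue.it Work, two different LT works
            Ed('9780000000003', 3, 11, 'en', None)]    # ISBN unknown to LT -> keyed by its own ISBN

clusters = defaultdict(set)
current_map = defaultdict(set)
for ed in editions:
    key = (ed.lt_work_id, ed.lang, ed.isbn if ed.lt_work_id is None else None, None)
    clusters[key].add(ed)
    current_map[ed.work_id].add(key)

franken_works = sorted([w for (w, keys) in current_map.items() if len(keys) > 1])
print franken_works   # [10]: Work 10 mixes editions from two clusters, so it is a FrankenWork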
def clean_frankenworks(s, do=False):
    # list out the email addresses of accounts with wishlists to be affected
    print "number of email addresses: ", len(s['affected_emails'])
    print ", ".join(s['affected_emails'])

    # list the works we delete
    print "number of FrankenWorks", len(s['franken_works'])
    print s['franken_works']

    # delete the affected comments
    print "deleting comments"
    for (i, comment) in enumerate(s['affected_comments']):
        print i, "deleting ", comment
        if do:
            comment.delete()

    # delete the Frankenworks
    print "deleting Frankenworks"
    for (i, work) in enumerate(s['affected_works']):
        print i, "deleting ", work.id
        if do:
            work.delete()

    # run reclustering surgically -- calculate a set of ISBNs to feed to bookloader.add_related
    # assuming x is a set
    popisbn = lambda x: list(x)[0].isbn if len(x) else None

    # group scattered_clusters by LT work id
    scattered_lt = dictlist([(k[0], k) for (k,v) in s['scattered_clusters']])
    isbns = map(popisbn, [s['work_clusters'][k[0]] for k in scattered_lt.values()])

    print "running bookloader"
    for (i, isbn) in enumerate(isbns):
        print i, isbn
        if do:
            bookloader.add_related(isbn)
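Putting the pieces together, the intended flow (mirrored by the new management command earlier in this commit) appears to be a dry run, a manual review of the report, then a destructive pass; roughly:

s = cluster_status()
print s['results']
print s['affected_emails']       # accounts to notify before anything is deleted
clean_frankenworks(s, do=False)  # dry run: report only
clean_frankenworks(s, do=True)   # delete the affected comments and works, then recluster via bookloader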