Merge branch 'production'

commit 538d225ad0

@@ -636,6 +636,7 @@ def add_missing_isbn_to_editions(max_num=None, confirm=False):
        'no_isbn_found': no_isbn_found,
        'editions_to_merge': editions_to_merge,
        'exceptions': exceptions,
        'google_id_not_found': google_id_not_found,
        'confirm': ok
    }
@@ -0,0 +1,34 @@
"""
Dispose of the Frankenworks and recluster the works. Print out email addresses of those whose wishlists have been
affected.
"""

from django.core.management.base import BaseCommand
from regluit.test import booktests


class Command(BaseCommand):
    help = "Dispose of the Frankenworks and recluster the works. Print out email addresses of those whose wishlists have been affected."
    args = "<do>"

    def handle(self, do, **options):

        try:
            do = str(do)
            if do.lower() == 'true':
                do = True
            else:
                do = False
        except:
            do = False

        print "before..."
        s = booktests.cluster_status()
        print s['results']

        booktests.clean_frankenworks(s, do=do)
        s = booktests.cluster_status()
        print "after cleanup...."
        print "results ", s['results']
        print "scattered clusters ", s['scattered_clusters']
        print "franken works", s['franken_works']
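For reference, a minimal sketch of driving this command from Python via Django's call_command. The command name below is hypothetical; it depends on the filename this module is installed under in a management/commands/ directory, which the diff does not show.

from django.core.management import call_command

# 'clean_frankenworks' is a hypothetical command name for this module.
call_command('clean_frankenworks', 'false')   # dry run: only report what would be deleted
call_command('clean_frankenworks', 'true')    # actually delete FrankenWorks and recluster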
@@ -11,6 +11,7 @@ import random
random.seed()

import sys, os
import json

# a kludge to allow for isbn.py to be imported
# and not just in the context of the regluit Django app
@@ -436,15 +437,14 @@ class FreebaseBooks(object):
        self.freebase.login(username,password)
    def books(self):
        MQL = u"""[{
"type": "/book/book",
"id": null,
"key": [{
"namespace": "/wikipedia/en",
"value": null,
"type": "/type/key"
}]
}]
""".replace("\n"," ")
"type": "/book/book",
"id": null,
"key": [{
"namespace": "/wikipedia/en",
"value": null,
"type": "/type/key"
}]
}]""".replace("\n"," ")
        query = json.loads(MQL)
        resp = self.freebase.mqlreaditer(query)
        for r in resp:
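The pattern in these FreebaseBooks methods is to keep the MQL query as a JSON string template, parse it with json.loads, optionally fill in one field of the resulting structure, and hand it to the Freebase client. A minimal sketch of that flow, mirroring the mqlreaditer call used throughout the diff; the book id filled in here is a made-up example value.

import json

MQL = u"""[{
    "type": "/book/book_edition",
    "id": null,
    "book": { "id": null, "name": null }
}]""".replace("\n", " ")

query = json.loads(MQL)                      # the template becomes a list holding one query dict
query[0]["book"]["id"] = "/en/moby-dick"     # hypothetical book id, mirrors editions_for_book()
# resp = self.freebase.mqlreaditer(query)    # then iterate over the matching editions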
@@ -452,18 +452,17 @@ class FreebaseBooks(object):

    def book_editions(self):
        MQL = u"""[{
"type": "/book/book_edition",
"id": null,
"isbn": [{}],
"ISBN": [{}],
"LCCN": [{}],
"OCLC_number": [{}],
"openlibrary_id": [{}],
"book": {
"id": null,
"name": null
}
}]""".replace("\n"," ")
"type": "/book/book_edition",
"id": null,
"isbn": [{}],
"ISBN": [{}],
"LCCN": [{}],
"OCLC_number": [{}],
"openlibrary_id": [{}],
"book": {
"id": null,
"name": null
}}]""".replace("\n"," ")
        query = json.loads(MQL)
        resp = self.freebase.mqlreaditer(query)
        for r in resp:
@@ -471,18 +470,17 @@ class FreebaseBooks(object):

    def editions_for_book(self, book_id):
        MQL = u"""[{
"type": "/book/book_edition",
"id": null,
"isbn": [{}],
"ISBN": [{}],
"LCCN": [{}],
"OCLC_number": [{}],
"openlibrary_id": [{}],
"book": {
"id": null,
"name": null
}
}]""".replace("\n"," ")
"type": "/book/book_edition",
"id": null,
"isbn": [{}],
"ISBN": [{}],
"LCCN": [{}],
"OCLC_number": [{}],
"openlibrary_id": [{}],
"book": {
"id": null,
"name": null
}}]""".replace("\n"," ")
        query = json.loads(MQL)
        query[0]["book"]["id"] = book_id
        resp = self.freebase.mqlreaditer(query)
@@ -491,18 +489,17 @@ class FreebaseBooks(object):

    def book_edition_by_id(self,id,id_type):
        MQL = u"""[{
"type": "/book/book_edition",
"id": null,
"isbn": [{}],
"ISBN": [{}],
"LCCN": [{}],
"OCLC_number": [{}],
"openlibrary_id": [{}],
"book": {
"id": null,
"name": null
}
}]""".replace("\n"," ")
"type": "/book/book_edition",
"id": null,
"isbn": [{}],
"ISBN": [{}],
"LCCN": [{}],
"OCLC_number": [{}],
"openlibrary_id": [{}],
"book": {
"id": null,
"name": null
}}]""".replace("\n"," ")
        query = json.loads(MQL)
        if id_type == 'isbn':
            query[0][id_type][0].setdefault('name', id)
@@ -526,18 +523,18 @@ class FreebaseBooks(object):
        elif isbn_val is not None:
            isbn_val = isbn_mod.ISBN(isbn_val).to_string('13')
            MQL = """[{
"type": "/book/book_edition",
"isbn": {
"name": null
},
"book": {
"editions": [{
"isbn": {
"name": null
}
}]
}
}]""".replace("\n"," ")
"type": "/book/book_edition",
"isbn": {
"name": null
},
"book": {
"editions": [{
"isbn": {
"name": null
}
}]
}
}]""".replace("\n"," ")
            query = json.loads(MQL)
            query[0]["book"]["editions"][0]["isbn"]["name"] = isbn_val
            resp = self.freebase.mqlreaditer(query)
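Throughout these lookups, ISBNs are normalized to ISBN-13 via isbn_mod.ISBN(...).to_string('13') before they are queried or cached. For reference, a self-contained sketch of what that ISBN-10 to ISBN-13 conversion amounts to; this helper is illustrative, not regluit's isbn module, and assumes a bare, unhyphenated 10-character ISBN.

def isbn10_to_isbn13(isbn10):
    """Convert a bare 10-digit ISBN to its 13-digit form (978 prefix, recomputed check digit)."""
    core = "978" + isbn10[:9]          # drop the ISBN-10 check digit, add the Bookland prefix
    # ISBN-13 check digit: digits weighted 1,3,1,3,... and summed mod 10
    total = sum((1 if i % 2 == 0 else 3) * int(d) for i, d in enumerate(core))
    return core + str((10 - total % 10) % 10)

assert isbn10_to_isbn13("0306406152") == "9780306406157"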
@@ -565,7 +562,91 @@ class WorkMapper(object):
            yield work_id
            if not complete_search:
                raise StopIteration()

class LibraryThing(object):
    """
    Provide cached access to thingisbn and LT whatwork interface. Allow for a cache file to be loaded and saved
    """
    def __init__(self, fname=None):
        self.__isbn_to_work_id = {}
        self.fname = fname
    def __del__(self):
        self.save()
    def thingisbn(self, isbn, return_work_id=False):
        """ if return_work_id is True, we won't try to calculate all the relevant isbns"""
        # first, normalize the isbn
        isbn = isbn_mod.ISBN(isbn).to_string('13')
        if isbn is None: return []

        # check to see whether we have isbn already
        if isbn in self.__isbn_to_work_id:
            # return all isbns with the work id
            # print "%s already cached" % (isbn)
            work_id = self.__isbn_to_work_id.get(isbn)

            if return_work_id:
                return work_id
            if work_id is not None:
                return [k for (k, v) in self.__isbn_to_work_id.items() if v == work_id]
            else:
                return []
        else:
            # if isbn is not already cached, do look up and cache the results and return the results
            print "calling thingisbn for %s" % (isbn)
            results = [isbn_mod.ISBN(k).to_string('13') for k in thingisbn(isbn)]
            if len(results):
                # look up the librarything work id
                work_id = self.whatwork(isbn)

                if work_id is not None: # which should be the case since results is not zero-length
                    self.__isbn_to_work_id.update(dict([(isbn_mod.ISBN(result).to_string('13'), work_id) for result in results]))
                else:
                    logger.exception("work_id should not be None for isbn %s", isbn)
                    return []
            else:
                self.__isbn_to_work_id[isbn] = None # mark as not recognized by LT
                work_id = None

            if return_work_id:
                return work_id
            else:
                return results

    def whatwork(self, isbn=None, title=None, author=None):
        # if isbn is not None and title, author None then look up results, otherwise just pass along to lt_whatwork
        # first, normalize the isbn
        isbn = isbn_mod.ISBN(isbn).to_string('13')
        if isbn is not None and (title is None and author is None):
            if isbn in self.__isbn_to_work_id:
                work_id = self.__isbn_to_work_id.get(isbn)
            else:
                work_id = lt_whatwork(isbn=isbn)
                self.__isbn_to_work_id[isbn] = work_id
            return work_id
        else:
            return lt_whatwork(isbn=isbn, title=title, author=author)
    def load(self):
        try:
            f = open(self.fname)
            input_data = json.load(f)
            f.close()

            if isinstance(input_data, dict):
                self.__isbn_to_work_id = input_data
                return True
            else:
                return False
        except Exception, e:
            print e
    def save(self):
        if self.fname is not None:
            f = open(self.fname, "w")
            json.dump(self.__isbn_to_work_id, f)
            f.close()
            return True
        else:
            return False
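Typical use of this cache wrapper, as exercised by cluster_status() and the test added below; the cache filename and the ISBN are placeholder example values, and on a cache miss thingisbn() hits the LibraryThing API.

lt = LibraryThing('lt_data.json')    # placeholder path for the JSON cache file
lt.load()                            # returns True if a cached isbn -> work_id dict was read

isbns = lt.thingisbn('9780306406157')                           # all ISBN-13s LT groups with this one
work_id = lt.thingisbn('9780306406157', return_work_id=True)    # just the LT work id, served from cache

lt.save()                            # persist the isbn -> work_id map back to the file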


def look_up_my_zotero_books_in_hathi():
    from regluit.experimental.zotero_books import MyZotero
@@ -786,6 +867,17 @@ class LibraryThingTest(TestCase):
        self.assertEqual(work_id, SURFACING_LT_WORK_ID)
        work_id = lt_whatwork(title='Hamlet', author='Shakespeare')
        self.assertEqual(work_id, '2199')
    def test_cache(self):

        lt = LibraryThing()
        res = lt.thingisbn(SURFACING_ISBN)

        res2 = lt.thingisbn(SURFACING_ISBN)
        self.assertEqual(set(res), set(res2))

        self.assertEqual(lt.whatwork(SURFACING_ISBN), SURFACING_LT_WORK_ID)

        self.assertEqual(lt.thingisbn(SURFACING_ISBN, return_work_id=True), SURFACING_LT_WORK_ID)


def suite():
@@ -793,7 +885,7 @@ def suite():
    #testcases = [WorkMapperTest,FreebaseBooksTest, OpenLibraryTest,GoogleBooksTest]
    testcases = []
    suites = unittest.TestSuite([unittest.TestLoader().loadTestsFromTestCase(testcase) for testcase in testcases])
    suites.addTest(LibraryThingTest('test_whatwork'))
    suites.addTest(LibraryThingTest('test_cache'))
    #suites.addTest(SettingsTest('test_dev_me_alignment')) # give option to test this alignment
    return suites
Binary file not shown.
@@ -1,12 +1,15 @@
from regluit.core import librarything, bookloader, models, tasks
from collections import OrderedDict
from itertools import izip, islice
from collections import OrderedDict, defaultdict, namedtuple
from itertools import izip, islice, repeat
import django

from django.db.models import Q, F
from regluit.core import bookloader
from django.contrib.comments.models import Comment

import warnings
import datetime
from regluit import experimental
from regluit.experimental import bookdata
from datetime import datetime
import json
@@ -14,6 +17,20 @@ import json
import logging
logger = logging.getLogger(__name__)

def dictset(itertuple):
    s = defaultdict(set)
    for (k, v) in itertuple:
        s[k].add(v)
    return s

def dictlist(itertuple):
    d = defaultdict(list)
    for (k, v) in itertuple:
        d[k].append(v)
    return d
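These two helpers group an iterable of (key, value) pairs: dictset de-duplicates values per key, dictlist keeps every occurrence. A small illustration with made-up (lang, edition id) pairs:

pairs = [('en', 1), ('en', 2), ('en', 1), ('fr', 3)]   # made-up data
assert dict(dictset(pairs)) == {'en': set([1, 2]), 'fr': set([3])}
assert dict(dictlist(pairs)) == {'en': [1, 2, 1], 'fr': [3]}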

EdInfo = namedtuple('EdInfo', ['isbn', 'ed_id', 'ed_title', 'ed_created', 'work_id', 'work_created', 'lang'])

def ry_lt_books():
    """return parsing of rdhyee's LibraryThing collection"""
    lt = librarything.LibraryThing('rdhyee')
@@ -85,26 +102,163 @@ def load_gutenberg_books(fname="/Users/raymondyee/D/Document/Gluejar/Gluejar.git
        else:
            logger.info("%d null seed_isbn: ebook %s", i, ebook)

def cluster_status():
def cluster_status(max_num=None):
    """Look at the current Work, Edition instances to figure out what needs to be fixed"""
    results = OrderedDict([
        ('number of Works', models.Work.objects.count()),
        ('number of Editions', models.Edition.objects.count())
        ('number of Works w/o Identifier', models.Work.objects.filter(identifiers__isnull=True).count()),
        ('number of Editions', models.Edition.objects.count()),
        ('number of Editions with ISBN', models.Edition.objects.filter(identifiers__type='isbn').count()),
        ('number of Editions without ISBNs', models.Edition.objects.exclude(identifiers__type='isbn').count()),
        ('number of Edition that have both Google Books id and ISBNs',
         models.Edition.objects.filter(identifiers__type='isbn').filter(identifiers__type='goog').count()),
        ('number of Editions with Google Books IDs but not ISBNs',
         models.Edition.objects.filter(identifiers__type='goog').exclude(identifiers__type='isbn').count()),
    ])

    # What needs to be done to recluster editions?
    # models.Identifier.objects.filter(type='isbn').values_list('value', 'edition__id', 'edition__work__id', 'edition__work__language').count()
    # 4 classes -- Editions have an ISBN or not & the ISBN is recognized or not by LT
    # a) ISBN recognized by LT, b) ISBN not recognized by LT, c) no ISBN at all

    # [w._meta.get_all_related_objects() for w in works_no_ids] -- try to figure out whether there are any related objects before deleting

    # Are there Editions without ISBNs? Look up the corresponding ISBNs from Google Books and are they all singletons?

    # identify Editions that should be merged (e.g., if one Edition has a Google Books ID and another Edition has one with
    # an ISBN tied to that Google Books ID)


    import shutil
    import time
    import operator


    return results
    # let's form a key to map all the Editions into:
    # (lt_work_id (or None), lang, ISBN (if lt_work_id is None, or None if we don't know it), ed_id (or None))

    work_clusters = defaultdict(set)
    current_map = defaultdict(set)

    #backup = '/Users/raymondyee/D/Document/Gluejar/Gluejar.github/regluit/experimental/lt_data_back.json'
    backup = '{0}/lt_data_back.json'.format(experimental.__path__[0])
    #fname = '/Users/raymondyee/D/Document/Gluejar/Gluejar.github/regluit/experimental/lt_data.json'
    fname = '{0}/lt_data.json'.format(experimental.__path__[0])

    shutil.copy(fname, backup)

    lt = bookdata.LibraryThing(fname)

    try:
        input_file = open(fname, "r")
        success = lt.load()
        print "success: %s" % (success)
        input_file.close()
    except Exception, e:
        print e

    for (i, (isbn, ed_id, ed_title, ed_created, work_id, work_created, lang)) in enumerate(
            islice(models.Identifier.objects.filter(type='isbn').values_list('value', 'edition__id',
                   'edition__title', 'edition__created', 'edition__work__id',
                   'edition__work__created', 'edition__work__language'), max_num)):

        lt_work_id = lt.thingisbn(isbn, return_work_id=True)
        key = (lt_work_id, lang, isbn if lt_work_id is None else None, None)
        print i, isbn, lt_work_id, key
        work_clusters[key].add(EdInfo(isbn=isbn, ed_id=ed_id, ed_title=ed_title, ed_created=ed_created,
                                      work_id=work_id, work_created=work_created, lang=lang))
        current_map[work_id].add(key)

    lt.save()

    # Now add the Editions without any ISBNs
    print "editions w/o isbn"
    for (i, (ed_id, ed_title, ed_created, work_id, work_created, lang)) in enumerate(
            islice(models.Edition.objects.exclude(identifiers__type='isbn').values_list('id',
                   'title', 'created', 'work__id', 'work__created', 'work__language' ), None)):

        key = (None, lang, None, ed_id)
        print i, ed_id, ed_title.encode('ascii','ignore'), key
        work_clusters[key].add(EdInfo(isbn=None, ed_id=ed_id, ed_title=ed_title, ed_created=ed_created,
                                      work_id=work_id, work_created=work_created, lang=lang))
        current_map[work_id].add(key)

    print "number of clusters", len(work_clusters)
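The key idea above: every Edition is mapped to a cluster key (lt_work_id, lang, isbn-or-None, ed_id-or-None), and current_map records which keys each existing unglue.it Work touches; a Work whose Editions land in more than one key is a FrankenWork. A toy illustration of that detection, with made-up ids:

from collections import defaultdict

current_map = defaultdict(set)
# (work_id, cluster_key) pairs for three editions; work 1 straddles two LT works
for work_id, key in [(1, ('lt-100', 'en', None, None)),
                     (1, ('lt-200', 'en', None, None)),
                     (2, ('lt-300', 'en', None, None))]:
    current_map[work_id].add(key)

franken_works = sorted([k for (k, v) in current_map.items() if len(v) > 1])
assert franken_works == [1]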
    # all unglue.it Works that contain Editions belonging to more than one newly calculated cluster are "FrankenWorks"
    franken_works = sorted([k for (k,v) in current_map.items() if len(v) > 1])

    # let's calculate the list of users affected if we delete the FrankenWorks, and the number of works deleted from their wishlists --
    # specifically, a list of emails to send out

    affected_works = [models.Work.objects.get(id=w_id) for w_id in franken_works]
    affected_wishlists = set(reduce(operator.add, [list(w.wishlists.all()) for w in affected_works])) if len(affected_works) else set()

    affected_emails = [w.user.email for w in affected_wishlists]
    affected_editions = reduce(operator.add, [list(w.editions.all()) for w in affected_works]) if len(affected_works) else []

    # calculate the Comments that would have to be deleted too.
    affected_comments = reduce(operator.add, [list(Comment.objects.for_model(w)) for w in affected_works]) if len(affected_works) else []

    # calculate the inverse of work_clusters
    wcp = dict(reduce(operator.add, [ list( izip([ed.ed_id for ed in eds], repeat(k))) for (k,eds) in work_clusters.items()]))

    # (I'm not completely sure of this calc -- but the datetime of the latest franken-event)
    latest_franken_event = max([ max([min(map(lambda x: x[1], v)) for v in dictlist([(wcp[ed["id"]], (ed["id"], ed["created"].isoformat()))
                            for ed in models.Work.objects.get(id=w_id).editions.values('id', 'created')]).values()])
                            for w_id in franken_works]) if len(franken_works) else None

    scattered_clusters = [(k, len(set(([e.work_id for e in v])))) for (k,v) in work_clusters.items() if len(set(([e.work_id for e in v]))) <> 1 ]

    s = {'work_clusters':work_clusters, 'current_map':current_map, 'results':results, 'franken_works': franken_works,
         'wcp':wcp, 'latest_franken_event': latest_franken_event, 'affected_works':affected_works,
         'affected_comments': affected_comments, 'scattered_clusters': scattered_clusters,
         'affected_emails': affected_emails}

    return s
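The latest_franken_event one-liner is dense: for each FrankenWork it groups that Work's Editions by their new cluster key, takes the earliest edition timestamp within each group, keeps the latest of those per Work, and then the latest overall. A hedged, equivalent sketch of the same computation, assuming the same wcp mapping, the dictlist helper, and the regluit models are in scope:

def latest_franken_event_readable(franken_works, wcp):
    """Sketch: when did the most recent 'franken' merge happen? (illustrative rewrite, not the committed code)"""
    per_work = []
    for w_id in franken_works:
        editions = models.Work.objects.get(id=w_id).editions.values('id', 'created')
        by_cluster = dictlist([(wcp[ed['id']], ed['created'].isoformat()) for ed in editions])
        # earliest edition in each new cluster; the latest of those marks when the clusters were glued together
        per_work.append(max(min(times) for times in by_cluster.values()))
    return max(per_work) if per_work else None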
def clean_frankenworks(s, do=False):
    # list out the email addresses of accounts with wishlists to be affected

    print "number of email addresses: ", len(s['affected_emails'])
    print ", ".join(s['affected_emails'])

    # list the works we delete
    print "number of FrankenWorks", len(s['franken_works'])
    print s['franken_works']

    # delete the affected comments
    print "deleting comments"
    for (i, comment) in enumerate(s['affected_comments']):
        print i, "deleting ", comment
        if do:
            comment.delete()

    # delete the Frankenworks
    print "deleting Frankenworks"
    for (i, work) in enumerate(s['affected_works']):
        print i, "deleting ", work.id
        if do:
            work.delete()

    # run reclustering surgically -- calculate a set of ISBNs to feed to bookloader.add_related

    # assuming x is a set
    popisbn = lambda x: list(x)[0].isbn if len(x) else None

    # group scattered_clusters by LT work id
    scattered_lt = dictlist([(k[0], k) for (k,v) in s['scattered_clusters']])
    isbns = map(popisbn, [s['work_clusters'][k[0]] for k in scattered_lt.values()])

    print "running bookloader"
    for (i, isbn) in enumerate(isbns):
        print i, isbn
        if do:
            bookloader.add_related(isbn)