"""
external library imports
"""
import datetime
import json
import logging
import warnings

from collections import OrderedDict, defaultdict, namedtuple
from datetime import datetime
from itertools import izip, islice, repeat

"""
django imports
"""
import django

from django.contrib.comments.models import Comment
from django.db.models import Q, F

"""
regluit imports
"""
from regluit import experimental
from regluit.core import librarything, bookloader, models, tasks
from regluit.experimental import bookdata

logger = logging.getLogger(__name__)


def dictset(itertuple):
    s = defaultdict(set)
    for (k, v) in itertuple:
        s[k].add(v)
    return s


def dictlist(itertuple):
    d = defaultdict(list)
    for (k, v) in itertuple:
        d[k].append(v)
    return d
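

# Illustrative example (hypothetical values, results shown as plain dicts):
#   dictset([('en', 1), ('en', 1), ('fr', 2)])  -> {'en': set([1]), 'fr': set([2])}
#   dictlist([('en', 1), ('en', 1), ('fr', 2)]) -> {'en': [1, 1], 'fr': [2]}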


EdInfo = namedtuple('EdInfo', ['isbn', 'ed_id', 'ed_title', 'ed_created', 'work_id', 'work_created', 'lang'])


def ry_lt_books():
    """return parsing of rdhyee's LibraryThing collection"""
    lt = librarything.LibraryThing('rdhyee')
    books = lt.parse_user_catalog(view_style=5)
    return books


def editions_for_lt(books):
    """return the Editions that correspond to the list of LibraryThing books"""
    editions = [bookloader.add_by_isbn(b["isbn"]) for b in books]
    return editions


def ry_lt_not_loaded():
    """Calculate which of the books on rdhyee's librarything list don't yield Editions"""
    books = list(ry_lt_books())
    editions = editions_for_lt(books)
    not_loaded_books = [b for (b, ed) in izip(books, editions) if ed is None]
    return not_loaded_books


def ry_wish_list_equal_loadable_lt_books():
    """return whether the set of works in the user's wishlist is the same as the works in a user's loadable editions from LT"""
    editions = editions_for_lt(ry_lt_books())
    # assume only one user -- and that we have run a LT book loading process for that user
    ry = django.contrib.auth.models.User.objects.all()[0]
    return set([ed.work for ed in filter(None, editions)]) == set(ry.wishlist.works.all())


def clear_works_editions_ebooks():
    models.Ebook.objects.all().delete()
    models.Work.objects.all().delete()
    models.Edition.objects.all().delete()


def load_penguin_moby_dick():
    seed_isbn = '9780142000083'
    ed = bookloader.add_by_isbn(seed_isbn)
    if ed.new:
        ed = tasks.populate_edition.delay(ed.isbn_13)


def load_gutenberg_moby_dick():
    title = "Moby Dick"
    ol_work_id = "/works/OL102749W"
    gutenberg_etext_id = 2701
    epub_url = "http://www.gutenberg.org/cache/epub/2701/pg2701.epub"
    license = 'http://www.gutenberg.org/license'
    lang = 'en'
    format = 'epub'
    publication_date = datetime(2001, 7, 1)
    seed_isbn = '9780142000083'  # http://www.amazon.com/Moby-Dick-Whale-Penguin-Classics-Deluxe/dp/0142000086

    ebook = bookloader.load_gutenberg_edition(title, gutenberg_etext_id, ol_work_id, seed_isbn,
                                              epub_url, format, license, lang, publication_date)
    return ebook


def load_gutenberg_books(fname="{0}/gutenberg/g_seed_isbn.json".format(experimental.__path__[0]),
                         max_num=None):

    headers = ()
    f = open(fname)
    records = json.load(f)
    f.close()

    for (i, record) in enumerate(islice(records, max_num)):
        if record['format'] == 'application/epub+zip':
            record['format'] = 'epub'
        elif record['format'] == 'application/pdf':
            record['format'] = 'pdf'
        if record['seed_isbn'] is not None:
            ebook = bookloader.load_gutenberg_edition(**record)
            logger.info("%d loaded ebook %s %s", i, ebook, record)
        else:
            logger.info("%d null seed_isbn: record %s", i, record)


def cluster_status(max_num=None):
    """Look at the current Work, Edition instances to figure out what needs to be fixed"""
    results = OrderedDict([
        ('number of Works', models.Work.objects.count()),
        ('number of Works w/o Identifier', models.Work.objects.filter(identifiers__isnull=True).count()),
        ('number of Editions', models.Edition.objects.count()),
        ('number of Editions with ISBN', models.Edition.objects.filter(identifiers__type='isbn').count()),
        ('number of Editions without ISBNs', models.Edition.objects.exclude(identifiers__type='isbn').count()),
        ('number of Editions that have both Google Books IDs and ISBNs',
            models.Edition.objects.filter(identifiers__type='isbn').filter(identifiers__type='goog').count()),
        ('number of Editions with Google Books IDs but not ISBNs',
            models.Edition.objects.filter(identifiers__type='goog').exclude(identifiers__type='isbn').count()),
    ])

    # models.Identifier.objects.filter(type='isbn').values_list('value', 'edition__id', 'edition__work__id', 'edition__work__language').count()
    # 4 classes -- an Edition has an ISBN or not & its ISBN is recognized or not by LT
    # a) ISBN recognized by LT, b) ISBN not recognized by LT, c) no ISBN at all

    # [w._meta.get_all_related_objects() for w in works_no_ids] -- try to figure out whether there are any related objects before deleting

    # Are there Editions without ISBNs? Look up the corresponding ISBNs from Google Books. Are they all singletons?

    # identify Editions that should be merged (e.g., if one Edition has a Google Books ID and another Edition has one with
    # an ISBN tied to that Google Books ID)

    import shutil
    import time
    import operator

    # let's form a key to map all the Editions into:
    # (lt_work_id (or None), lang, ISBN (only when lt_work_id is None; None otherwise or when there is no ISBN), ed_id (or None))
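    #
    # Illustrative keys (hypothetical values): an Edition whose ISBN LibraryThing recognizes falls under
    # a key like (2236561, 'en', None, None); one whose ISBN LT does not recognize falls under
    # (None, 'en', '9780142000083', None); an Edition with no ISBN at all falls under (None, 'en', None, ed_id).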

    work_clusters = defaultdict(set)
    current_map = defaultdict(set)

    #backup = '/Users/raymondyee/D/Document/Gluejar/Gluejar.github/regluit/experimental/lt_data_back.json'
    backup = '{0}/lt_data_back.json'.format(experimental.__path__[0])
    #fname = '/Users/raymondyee/D/Document/Gluejar/Gluejar.github/regluit/experimental/lt_data.json'
    fname = '{0}/lt_data.json'.format(experimental.__path__[0])

    shutil.copy(fname, backup)

    lt = bookdata.LibraryThing(fname)

    try:
        input_file = open(fname, "r")
        success = lt.load()
        print "success: %s" % (success)
        input_file.close()
    except Exception, e:
        print e

    for (i, (isbn, ed_id, ed_title, ed_created, work_id, work_created, lang)) in enumerate(
            islice(models.Identifier.objects.filter(type='isbn').values_list('value', 'edition__id',
                'edition__title', 'edition__created', 'edition__work__id',
                'edition__work__created', 'edition__work__language'), max_num)):

        lt_work_id = lt.thingisbn(isbn, return_work_id=True)
        key = (lt_work_id, lang, isbn if lt_work_id is None else None, None)
        print i, isbn, lt_work_id, key
        work_clusters[key].add(EdInfo(isbn=isbn, ed_id=ed_id, ed_title=ed_title, ed_created=ed_created,
                                      work_id=work_id, work_created=work_created, lang=lang))
        current_map[work_id].add(key)

    lt.save()

    # Now add the Editions without any ISBNs
    print "editions w/o isbn"
    for (i, (ed_id, ed_title, ed_created, work_id, work_created, lang)) in enumerate(
            islice(models.Edition.objects.exclude(identifiers__type='isbn').values_list('id',
                'title', 'created', 'work__id', 'work__created', 'work__language'), None)):

        key = (None, lang, None, ed_id)
        print i, ed_id, ed_title.encode('ascii', 'ignore'), key
        work_clusters[key].add(EdInfo(isbn=None, ed_id=ed_id, ed_title=ed_title, ed_created=ed_created,
                                      work_id=work_id, work_created=work_created, lang=lang))
        current_map[work_id].add(key)

    print "number of clusters", len(work_clusters)

    # all unglue.it Works that contain Editions belonging to more than one newly calculated cluster are "FrankenWorks"
    franken_works = sorted([k for (k, v) in current_map.items() if len(v) > 1])

    # let's calculate the list of users affected if we delete the FrankenWorks and the number of works removed
    # from their wishlists -- specifically, a list of emails to send out

    affected_works = [models.Work.objects.get(id=w_id) for w_id in franken_works]
    affected_wishlists = set(reduce(operator.add, [list(w.wishlists.all()) for w in affected_works])) if len(affected_works) else set()

    affected_emails = [w.user.email for w in affected_wishlists]
    affected_editions = reduce(operator.add, [list(w.editions.all()) for w in affected_works]) if len(affected_works) else []

    # calculate the Comments that would have to be deleted too
    affected_comments = reduce(operator.add, [list(Comment.objects.for_model(w)) for w in affected_works]) if len(affected_works) else []

    # calculate the inverse of work_clusters
    wcp = dict(reduce(operator.add, [list(izip([ed.ed_id for ed in eds], repeat(k))) for (k, eds) in work_clusters.items()]))
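    # (wcp maps each ed_id back to a cluster key it was placed in; if an Edition has several ISBNs and
    # lands in more than one cluster, the last key wins)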

    # (I'm not completely sure of this calc -- but the datetime of the latest franken-event)
    latest_franken_event = max([max([min(map(lambda x: x[1], v)) for v in dictlist([(wcp[ed["id"]], (ed["id"], ed["created"].isoformat()))
                                     for ed in models.Work.objects.get(id=w_id).editions.values('id', 'created')]).values()])
                                for w_id in franken_works]) if len(franken_works) else None

    scattered_clusters = [(k, len(set([e.work_id for e in v]))) for (k, v) in work_clusters.items() if len(set([e.work_id for e in v])) != 1]

    s = {'work_clusters': work_clusters, 'current_map': current_map, 'results': results, 'franken_works': franken_works,
         'wcp': wcp, 'latest_franken_event': latest_franken_event, 'affected_works': affected_works,
         'affected_comments': affected_comments, 'scattered_clusters': scattered_clusters,
         'affected_emails': affected_emails}

    return s


def clean_frankenworks(s, do=False):
    # list out the email addresses of accounts with wishlists to be affected
    print "number of email addresses: ", len(s['affected_emails'])
    print ", ".join(s['affected_emails'])

    # list the works we delete
    print "number of FrankenWorks", len(s['franken_works'])
    print s['franken_works']

    # delete the affected comments
    print "deleting comments"
    for (i, comment) in enumerate(s['affected_comments']):
        print i, "deleting ", comment
        if do:
            comment.delete()

    # delete the Frankenworks
    print "deleting Frankenworks"
    for (i, work) in enumerate(s['affected_works']):
        print i, "deleting ", work.id
        if do:
            work.delete()

    # run reclustering surgically -- calculate a set of ISBNs to feed to bookloader.add_related

    # assuming x is a set
    popisbn = lambda x: list(x)[0].isbn if len(x) else None

    # group scattered_clusters by LT work id
    scattered_lt = dictlist([(k[0], k) for (k, v) in s['scattered_clusters']])
    isbns = map(popisbn, [s['work_clusters'][k[0]] for k in scattered_lt.values()])

    print "running bookloader"
    for (i, isbn) in enumerate(isbns):
        print i, isbn
        if do:
            bookloader.add_related(isbn)
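

# A minimal usage sketch (illustrative; run from a Django shell with the regluit settings loaded):
#   >>> s = cluster_status(max_num=100)    # survey the first 100 ISBN identifiers
#   >>> len(s['franken_works']), len(s['affected_emails'])
#   >>> clean_frankenworks(s)              # dry run: only prints what would be deleted
#   >>> clean_frankenworks(s, do=True)     # actually delete comments/works and recluster via bookloader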