""" external library imports """ import datetime import json import logging import warnings from collections import OrderedDict, defaultdict, namedtuple from datetime import datetime from itertools import izip, islice, repeat """ django imports """ import django from django_comments.models import Comment from django.db.models import Q, F """ regluit imports """ from regluit import experimental from regluit.core import librarything, bookloader, models, tasks from regluit.experimental import bookdata logger = logging.getLogger(__name__) def dictset(itertuple): s = defaultdict(set) for (k, v) in itertuple: s[k].add(v) return s def dictlist(itertuple): d = defaultdict(list) for (k, v) in itertuple: d[k].append(v) return d EdInfo = namedtuple('EdInfo', ['isbn', 'ed_id', 'ed_title', 'ed_created', 'work_id', 'work_created', 'lang']) def ry_lt_books(): """return parsing of rdhyee's LibraryThing collection""" lt = librarything.LibraryThing('rdhyee') books = lt.parse_user_catalog(view_style=5) return books def editions_for_lt(books): """return the Editions that correspond to the list of LibraryThing books""" editions = [bookloader.add_by_isbn(b["isbn"]) for b in books] return editions def ry_lt_not_loaded(): """Calculate which of the books on rdhyee's librarything list don't yield Editions""" books = list(ry_lt_books()) editions = editions_for_lt(books) not_loaded_books = [b for (b, ed) in izip(books, editions) if ed is None] return not_loaded_books def ry_wish_list_equal_loadable_lt_books(): """returnwhether the set of works in the user's wishlist is the same as the works in a user's loadable editions from LT""" editions = editions_for_lt(ry_lt_books()) # assume only one user -- and that we have run a LT book loading process for that user ry = django.contrib.auth.models.User.objects.all()[0] return set([ed.work for ed in filter(None, editions)]) == set(ry.wishlist.works.all()) def clear_works_editions_ebooks(): models.Ebook.objects.all().delete() models.Work.objects.all().delete() models.Edition.objects.all().delete() def load_penguin_moby_dick(): seed_isbn = '9780142000083' ed = bookloader.add_by_isbn(seed_isbn) if ed.new: ed = tasks.populate_edition.delay(ed.isbn_13) def load_gutenberg_moby_dick(): title = "Moby Dick" ol_work_id = "/works/OL102749W" gutenberg_etext_id = 2701 epub_url = "http://www.gutenberg.org/cache/epub/2701/pg2701.epub" license = 'http://www.gutenberg.org/license' lang = 'en' format = 'epub' publication_date = datetime(2001,7,1) seed_isbn = '9780142000083' # http://www.amazon.com/Moby-Dick-Whale-Penguin-Classics-Deluxe/dp/0142000086 ebook = bookloader.load_gutenberg_edition(title, gutenberg_etext_id, ol_work_id, seed_isbn, epub_url, format, license, lang, publication_date) return ebook def load_gutenberg_books(fname="{0}/gutenberg/g_seed_isbn.json".format(experimental.__path__[0]), max_num=None): headers = () f = open(fname) records = json.load(f) f.close() for (i, record) in enumerate(islice(records,max_num)): if record['format'] == 'application/epub+zip': record['format'] = 'epub' elif record['format'] == 'application/pdf': record['format'] = 'pdf' if record['seed_isbn'] is not None: ebook = bookloader.load_gutenberg_edition(**record) logger.info("%d loaded ebook %s %s", i, ebook, record) else: logger.info("%d null seed_isbn: ebook %s", i, ebook) def cluster_status(max_num=None): """Look at the current Work, Edition instances to figure out what needs to be fixed""" results = OrderedDict([ ('number of Works', models.Work.objects.count()), ('number of Works w/o Identifier', 

def cluster_status(max_num=None):
    """Look at the current Work and Edition instances to figure out what needs to be fixed."""
    results = OrderedDict([
        ('number of Works', models.Work.objects.count()),
        ('number of Works w/o Identifier',
         models.Work.objects.filter(identifiers__isnull=True).count()),
        ('number of Editions', models.Edition.objects.count()),
        ('number of Editions with ISBN',
         models.Edition.objects.filter(identifiers__type='isbn').count()),
        ('number of Editions without ISBNs',
         models.Edition.objects.exclude(identifiers__type='isbn').count()),
        ('number of Editions that have both Google Books ids and ISBNs',
         models.Edition.objects.filter(identifiers__type='isbn').filter(identifiers__type='goog').count()),
        ('number of Editions with Google Books IDs but not ISBNs',
         models.Edition.objects.filter(identifiers__type='goog').exclude(identifiers__type='isbn').count()),
    ])

    # models.Identifier.objects.filter(type='isbn').values_list('value', 'edition__id',
    #     'edition__work__id', 'edition__work__language').count()

    # 4 classes -- an Edition has an ISBN or not & the ISBN is recognized or not by LT:
    # a) ISBN recognized by LT, b) ISBN not recognized by LT, c) no ISBN at all
    # [w._meta.get_all_related_objects() for w in works_no_ids] -- try to figure out whether
    # there are any related objects before deleting

    # Are there Editions without ISBNs? Look up the corresponding ISBNs from Google Books
    # and check whether they are all singletons.

    # Identify Editions that should be merged (e.g., if one Edition has a Google Books ID and
    # another Edition has an ISBN tied to that Google Books ID)

    import shutil
    import time
    import operator

    # let's form a key to map all the Editions into
    # (lt_work_id (or None), lang, ISBN (if lt_work_id is None or None if we don't know it), ed_id (or None))

    work_clusters = defaultdict(set)
    current_map = defaultdict(set)

    #backup = '/Users/raymondyee/D/Document/Gluejar/Gluejar.github/regluit/experimental/lt_data_back.json'
    backup = '{0}/lt_data_back.json'.format(experimental.__path__[0])

    #fname = '/Users/raymondyee/D/Document/Gluejar/Gluejar.github/regluit/experimental/lt_data.json'
    fname = '{0}/lt_data.json'.format(experimental.__path__[0])

    shutil.copy(fname, backup)

    lt = bookdata.LibraryThing(fname)

    try:
        input_file = open(fname, "r")
        success = lt.load()
        print "success: %s" % (success)
        input_file.close()
    except Exception as e:
        print e

    for (i, (isbn, ed_id, ed_title, ed_created, work_id, work_created, lang)) in enumerate(
            islice(models.Identifier.objects.filter(type='isbn').values_list('value',
                   'edition__id', 'edition__title', 'edition__created', 'edition__work__id',
                   'edition__work__created', 'edition__work__language'), max_num)):

        lt_work_id = lt.thingisbn(isbn, return_work_id=True)
        key = (lt_work_id, lang, isbn if lt_work_id is None else None, None)
        print i, isbn, lt_work_id, key

        work_clusters[key].add(EdInfo(isbn=isbn, ed_id=ed_id, ed_title=ed_title,
                                      ed_created=ed_created, work_id=work_id,
                                      work_created=work_created, lang=lang))
        current_map[work_id].add(key)

    lt.save()

    # Now add the Editions without any ISBNs
    print "editions w/o isbn"
    for (i, (ed_id, ed_title, ed_created, work_id, work_created, lang)) in enumerate(
            islice(models.Edition.objects.exclude(identifiers__type='isbn').values_list('id',
                   'title', 'created', 'work__id', 'work__created', 'work__language'), None)):

        key = (None, lang, None, ed_id)
        print i, ed_id, ed_title.encode('ascii', 'ignore'), key

        work_clusters[key].add(EdInfo(isbn=None, ed_id=ed_id, ed_title=ed_title,
                                      ed_created=ed_created, work_id=work_id,
                                      work_created=work_created, lang=lang))
        current_map[work_id].add(key)

    print "number of clusters", len(work_clusters)
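
    # At this point work_clusters maps each newly computed cluster key
    # (lt_work_id or None, lang, isbn or None, edition id or None) to the set of EdInfo tuples
    # falling into that cluster, while current_map maps each existing unglue.it Work id to the
    # set of cluster keys its Editions landed in. For example (hypothetical values),
    # current_map[42] == set([(u'1234567', 'en', None, None), (None, 'en', u'9780142000083', None)])
    # would mean Work 42 mixes Editions from two different clusters.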
    # all unglue.it Works that contain Editions belonging to more than one newly calculated
    # cluster are "FrankenWorks"
    franken_works = sorted([k for (k, v) in current_map.items() if len(v) > 1])

    # let's calculate the list of users affected if we delete the FrankenWorks and the number of
    # works deleted from their wishlists -- specifically, a list of emails to send out
    affected_works = [models.Work.objects.get(id=w_id) for w_id in franken_works]
    affected_wishlists = set(reduce(operator.add, [list(w.wishlists.all()) for w in affected_works])) \
        if len(affected_works) else set()
    affected_emails = [w.user.email for w in affected_wishlists]

    affected_editions = reduce(operator.add, [list(w.editions.all()) for w in affected_works]) \
        if len(affected_works) else []

    # calculate the Comments that would have to be deleted too
    affected_comments = reduce(operator.add, [list(Comment.objects.for_model(w)) for w in affected_works]) \
        if len(affected_works) else []

    # calculate the inverse of work_clusters
    wcp = dict(reduce(operator.add,
                      [list(izip([ed.ed_id for ed in eds], repeat(k)))
                       for (k, eds) in work_clusters.items()]))

    # (I'm not completely sure of this calc -- but the datetime of the latest franken-event)
    latest_franken_event = max([
        max([min(map(lambda x: x[1], v)) for v in
             dictlist([(wcp[ed["id"]], (ed["id"], ed["created"].isoformat()))
                       for ed in models.Work.objects.get(id=w_id).editions.values('id', 'created')]).values()])
        for w_id in franken_works]) if len(franken_works) else None

    scattered_clusters = [(k, len(set([e.work_id for e in v])))
                          for (k, v) in work_clusters.items()
                          if len(set([e.work_id for e in v])) != 1]

    s = {'work_clusters': work_clusters, 'current_map': current_map, 'results': results,
         'franken_works': franken_works, 'wcp': wcp, 'latest_franken_event': latest_franken_event,
         'affected_works': affected_works, 'affected_comments': affected_comments,
         'scattered_clusters': scattered_clusters, 'affected_emails': affected_emails}

    return s

def clean_frankenworks(s, do=False):
    # list out the email addresses of accounts with wishlists to be affected
    print "number of email addresses: ", len(s['affected_emails'])
    print ", ".join(s['affected_emails'])

    # list the works we delete
    print "number of FrankenWorks", len(s['franken_works'])
    print s['franken_works']

    # delete the affected comments
    print "deleting comments"
    for (i, comment) in enumerate(s['affected_comments']):
        print i, "deleting ", comment
        if do:
            comment.delete()

    # delete the FrankenWorks
    print "deleting FrankenWorks"
    for (i, work) in enumerate(s['affected_works']):
        print i, "deleting ", work.id
        if do:
            work.delete()

    # run reclustering surgically -- calculate a set of ISBNs to feed to bookloader.add_related

    # assuming x is a set
    popisbn = lambda x: list(x)[0].isbn if len(x) else None

    # group scattered_clusters by LT work id
    scattered_lt = dictlist([(k[0], k) for (k, v) in s['scattered_clusters']])
    isbns = map(popisbn, [s['work_clusters'][k[0]] for k in scattered_lt.values()])

    print "running bookloader"
    for (i, isbn) in enumerate(isbns):
        print i, isbn
        if do:
            bookloader.add_related(isbn)
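
# Hedged usage sketch: the workflow implied by the two functions above appears to be a dry run
# of clean_frankenworks() followed by a committing run. The function name run_cleanup and its
# arguments are illustrative assumptions; cluster_status() and clean_frankenworks() are the real
# functions defined in this module.
def run_cleanup(commit=False, max_num=None):
    """Compute the clustering report and optionally delete the FrankenWorks it identifies."""
    s = cluster_status(max_num=max_num)
    # review the report before committing: how many works would go and how many users are affected
    print "works to delete: %d, users affected: %d" % (
        len(s['franken_works']), len(s['affected_emails']))
    # do=False only prints what would happen; do=True deletes and re-runs bookloader.add_related
    clean_frankenworks(s, do=commit)
    return s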