# regluit/test/booktests.py

from regluit.core import librarything, bookloader, models, tasks
from collections import OrderedDict, defaultdict, namedtuple
from itertools import izip, islice, repeat
import django

from django.db.models import Q, F
from regluit.core import bookloader
from django.contrib.comments.models import Comment

import warnings
import datetime
from regluit import experimental
from regluit.experimental import bookdata
from datetime import datetime
import json

import logging
logger = logging.getLogger(__name__)

def dictset(itertuple):
    """Group (key, value) pairs into a mapping of key -> set of values.

    itertuple -- any iterable yielding 2-item tuples.

    Returns a defaultdict(set), so looking up a key that was never seen
    yields (and stores) an empty set.
    """
    grouped = defaultdict(set)
    for pair in itertuple:
        key, member = pair
        grouped[key].add(member)
    return grouped

def dictlist(itertuple):
    """Group (key, value) pairs into a mapping of key -> list of values.

    itertuple -- any iterable yielding 2-item tuples.

    Values are kept in the order they arrive and duplicates are retained.
    Returns a defaultdict(list), so looking up a key that was never seen
    yields (and stores) an empty list.
    """
    buckets = defaultdict(list)
    for pair in itertuple:
        key, item = pair
        buckets[key].append(item)
    return buckets
    
# Flat record describing one edition together with the work it currently belongs to;
# cluster_status stores these in its cluster sets (isbn is None for editions without an ISBN).
EdInfo = namedtuple('EdInfo', ['isbn', 'ed_id', 'ed_title', 'ed_created', 'work_id', 'work_created', 'lang'])
    
def ry_lt_books():
    """Parse rdhyee's LibraryThing catalog and return whatever the parser yields."""
    catalog = librarything.LibraryThing('rdhyee')
    return catalog.parse_user_catalog(view_style=5)

def editions_for_lt(books):
    """Look up (or load) an Edition for each LibraryThing book via its "isbn" entry.

    Returns a list parallel to ``books``; each item is whatever
    bookloader.add_by_isbn returned for that book's ISBN.
    """
    loaded = []
    for book in books:
        loaded.append(bookloader.add_by_isbn(book["isbn"]))
    return loaded

def ry_lt_not_loaded():
    """Return the books on rdhyee's LibraryThing list for which no Edition was produced."""
    # materialize the catalog so it can be walked in step with the editions
    books = list(ry_lt_books())
    editions = editions_for_lt(books)

    missing = []
    for (book, edition) in izip(books, editions):
        if edition is None:
            missing.append(book)
    return missing

def ry_wish_list_equal_loadable_lt_books():
    """Return whether the works on the user's wishlist equal the works of the
    user's loadable LibraryThing editions.

    Assumes there is only one user and that a LibraryThing loading process has
    already been run for that user; raises IndexError if there is no user.
    """
    # Import the submodule explicitly: a bare ``import django`` does not load
    # django.contrib.auth.models, so attribute access through ``django`` only
    # works when some other module happens to have imported it already.
    from django.contrib.auth.models import User

    editions = editions_for_lt(ry_lt_books())
    # assume only one user -- and that we have run a LT book loading process for that user
    ry = User.objects.all()[0]
    # editions that failed to load are falsy and are skipped
    loadable_works = set([ed.work for ed in editions if ed])
    return loadable_works == set(ry.wishlist.works.all())

def clear_works_editions_ebooks():
    """Delete every Ebook, then every Work, then every Edition from the database."""
    # the order is deliberate and matches the original: Ebook, Work, Edition
    for model in (models.Ebook, models.Work, models.Edition):
        model.objects.all().delete()
    
               
def load_penguin_moby_dick():
    """Load the Penguin edition of Moby Dick and, if it is new, queue its population.

    Returns None. If no edition can be loaded for the seed ISBN a warning is
    logged and nothing is queued.
    """
    seed_isbn = '9780142000083'
    ed = bookloader.add_by_isbn(seed_isbn)
    if ed is None:
        # add_by_isbn may hand back None when it cannot produce an Edition;
        # bail out instead of failing on ``ed.new`` below.
        logger.warning("no edition could be loaded for isbn %s", seed_isbn)
        return
    if ed.new:
        # fire-and-forget task; the async result is not needed here
        tasks.populate_edition.delay(ed.isbn_13)

def load_gutenberg_moby_dick():
    """Load the Project Gutenberg epub of Moby Dick (etext 2701) and return the result
    of bookloader.load_gutenberg_edition."""
    # seed ISBN is the Penguin Classics Deluxe edition:
    # http://www.amazon.com/Moby-Dick-Whale-Penguin-Classics-Deluxe/dp/0142000086
    penguin_isbn = '9780142000083'
    released = datetime(2001, 7, 1)

    return bookloader.load_gutenberg_edition(
        "Moby Dick",                                             # title
        2701,                                                    # gutenberg_etext_id
        "/works/OL102749W",                                      # ol_work_id
        penguin_isbn,                                            # seed_isbn
        "http://www.gutenberg.org/cache/epub/2701/pg2701.epub",  # epub_url
        'epub',                                                  # format
        'http://www.gutenberg.org/license',                      # license
        'en',                                                    # lang
        released)                                                # publication_date

def load_gutenberg_books(fname="{0}/gutenberg/g_seed_isbn.json".format(experimental.__path__[0]),
                         max_num=None):
    """Load Gutenberg ebooks described by a JSON seed file.

    fname   -- path of the JSON file; it is expected to hold a sequence of
               records (dicts), each of which is passed as keyword arguments
               to bookloader.load_gutenberg_edition.
    max_num -- only the first max_num records are processed (None for all).

    Each record's 'format' MIME type is rewritten in place to the short name
    ('epub' or 'pdf') when it is one of the two known types. Records whose
    'seed_isbn' is None are skipped and only logged.
    """
    # MIME types found in the seed file -> short format names
    short_formats = {
        'application/epub+zip': 'epub',
        'application/pdf': 'pdf',
    }

    # the context manager closes the file even if the JSON is malformed
    with open(fname) as f:
        records = json.load(f)

    for (i, record) in enumerate(islice(records, max_num)):
        record['format'] = short_formats.get(record['format'], record['format'])
        if record['seed_isbn'] is not None:
            ebook = bookloader.load_gutenberg_edition(**record)
            logger.info("%d loaded ebook %s %s", i, ebook, record)
        else:
            # nothing was loaded for this record, so log the record itself
            # (the previous code logged an unbound or stale ``ebook`` here)
            logger.info("%d null seed_isbn: record %s", i, record)

def cluster_status(max_num=None):
    """Look at the current Work, Edition instances to figure out what needs to be fixed"""
    results = OrderedDict([
        ('number of Works', models.Work.objects.count()),
        ('number of Works w/o Identifier', models.Work.objects.filter(identifiers__isnull=True).count()),
        ('number of Editions', models.Edition.objects.count()),
        ('number of Editions with ISBN', models.Edition.objects.filter(identifiers__type='isbn').count()),
        ('number of Editions without ISBNs', models.Edition.objects.exclude(identifiers__type='isbn').count()),
        ('number of Edition that have both Google Books id and ISBNs',
             models.Edition.objects.filter(identifiers__type='isbn').filter(identifiers__type='goog').count()),
        ('number of Editions with Google Books IDs but not ISBNs',
             models.Edition.objects.filter(identifiers__type='goog').exclude(identifiers__type='isbn').count()),
        ])
    
    # models.Identifier.objects.filter(type='isbn').values_list('value', 'edition__id', 'edition__work__id', 'edition__work__language').count()
    # 4 classes -- Edition have ISBN or not & ISBN is recognized or not by LT
    # a) ISBN recognized by LT, b) ISBN not recognized by LT, c) no ISBN at all
    
    # [w._meta.get_all_related_objects() for w in works_no_ids] -- try to figure out whether any related objects before deleting
    
    # Are there Edition without ISBNs?  Look up the corresponding ISBNs from Google Books and Are they all singletons?
    
    # identify Editions that should be merged (e.g., if one Edition has a Google Books ID and another Edition has one with
    # an ISBN tied to that Google Books ID)


    import shutil
    import time
    import operator
 
    
    # let's form a key to map all the Editions into
    # (lt_work_id (or None), lang, ISBN (if lt_work_id is None or None if we don't know it), ed_id (or None) )
    
    work_clusters = defaultdict(set)
    current_map = defaultdict(set)
    
    #backup = '/Users/raymondyee/D/Document/Gluejar/Gluejar.github/regluit/experimental/lt_data_back.json'
    backup = '{0}/lt_data_back.json'.format(experimental.__path__[0])
    #fname = '/Users/raymondyee/D/Document/Gluejar/Gluejar.github/regluit/experimental/lt_data.json'
    fname = '{0}/lt_data.json'.format(experimental.__path__[0])
    
    shutil.copy(fname, backup)
        
    lt = bookdata.LibraryThing(fname)

    try:
        input_file = open(fname, "r")
        success = lt.load()
        print "success: %s" % (success)
        input_file.close()
    except Exception, e:
        print e
    
    for (i, (isbn, ed_id, ed_title, ed_created,  work_id, work_created, lang)) in enumerate(
        islice(models.Identifier.objects.filter(type='isbn').values_list('value', 'edition__id',
                'edition__title', 'edition__created', 'edition__work__id',
                'edition__work__created', 'edition__work__language'), max_num)):
        
        lt_work_id = lt.thingisbn(isbn, return_work_id=True)
        key = (lt_work_id, lang, isbn if lt_work_id is None else None, None)
        print i, isbn, lt_work_id, key
        work_clusters[key].add(EdInfo(isbn=isbn, ed_id=ed_id, ed_title=ed_title, ed_created=ed_created,
                                      work_id=work_id, work_created=work_created, lang=lang))
        current_map[work_id].add(key)
    
    lt.save()
    
    # Now add the Editions without any ISBNs
    print "editions w/o isbn"
    for (i, (ed_id, ed_title, ed_created, work_id, work_created, lang)) in enumerate(
        islice(models.Edition.objects.exclude(identifiers__type='isbn').values_list('id',
                'title', 'created', 'work__id', 'work__created', 'work__language' ), None)):
        
        key = (None, lang, None, ed_id)
        print i, ed_id, ed_title.encode('ascii','ignore'), key
        work_clusters[key].add(EdInfo(isbn=None, ed_id=ed_id, ed_title=ed_title, ed_created=ed_created,
                                      work_id=work_id, work_created=work_created, lang=lang))
        current_map[work_id].add(key)

    print "number of clusters", len(work_clusters)
    
    # all unglue.it Works that contain Editions belonging to more than one newly calculated cluster are "FrankenWorks"
    franken_works = sorted([k for (k,v) in current_map.items() if len(v) > 1])
    
    # let's calculate the list of users affected if delete the Frankenworks, the number of works deleted from their wishlist
    # specifically a list of emails to send out
    
    affected_works = [models.Work.objects.get(id=w_id)  for w_id in franken_works]
    affected_wishlists = set(reduce(operator.add, [list(w.wishlists.all())  for w in affected_works])) if len(affected_works) else set()
    
    affected_emails = [w.user.email  for w in affected_wishlists]
    affected_editions = reduce(operator.add, [list(w.editions.all()) for w in affected_works]) if len(affected_works) else []
    
    # calculate the Comments that would have to be deleted too.
    affected_comments = reduce(operator.add, [list(Comment.objects.for_model(w)) for w in affected_works]) if len(affected_works) else []
    
    # calculate the inverse of work_clusters
    wcp = dict(reduce(operator.add, [ list( izip([ed.ed_id for ed in eds], repeat(k))) for (k,eds) in work_clusters.items()]))
    
    # (I'm not completely sure of this calc -- but the datetime of the latest franken-event)
    latest_franken_event = max([ max([min(map(lambda x: x[1], v)) for v in dictlist([(wcp[ed["id"]], (ed["id"], ed["created"].isoformat()))
        for ed in models.Work.objects.get(id=w_id).editions.values('id', 'created')]).values()])
         for w_id in franken_works]) if len(franken_works) else None
    
    scattered_clusters = [(k, len(set(([e.work_id for e in v])))) for (k,v) in work_clusters.items() if len(set(([e.work_id for e in v]))) <> 1 ]    
    
    s = {'work_clusters':work_clusters, 'current_map':current_map, 'results':results, 'franken_works': franken_works,
         'wcp':wcp, 'latest_franken_event': latest_franken_event, 'affected_works':affected_works,
         'affected_comments': affected_comments, 'scattered_clusters': scattered_clusters,
         'affected_emails': affected_emails}
    
    return s

def clean_frankenworks(s, do=False):
    # list out the email addresses of accounts with wishlists to be affected
    
    print "number of email addresses: ", len(s['affected_emails'])
    print ", ".join(s['affected_emails'])
    
    # list the works we delete
    print "number of FrankenWorks", len(s['franken_works'])
    print s['franken_works']
    
    # delete the affected comments
    print "deleting comments"
    for (i, comment) in enumerate(s['affected_comments']):
        print i, "deleting ", comment
        if do:
            comment.delete()
    
    # delete the Frankenworks
    print "deleting Frankenworks"
    for (i, work) in enumerate(s['affected_works']):
        print i, "deleting ", work.id
        if do:
            work.delete()    
    
    # run reclustering surgically -- calculate a set of ISBNs to feed to bookloader.add_related
    
    # assuming x is a set
    popisbn = lambda x: list(x)[0].isbn if len(x) else None
    
    # group scattered_clusters by LT work id
    scattered_lt = dictlist([(k[0], k) for (k,v) in s['scattered_clusters']])
    isbns = map(popisbn, [s['work_clusters'][k[0]] for k in scattered_lt.values()])
    
    print "running bookloader"
    for (i, isbn) in enumerate(isbns):
        print i, isbn
        if do:
            bookloader.add_related(isbn)
Getting close to implementing Gutenberg loading for at least one book Moby Dick 2012-02-02 01:09:04 +00:00			`from regluit.core import librarything, bookloader, models, tasks`
Now I think I'm able to calculate the timedate of when the latest "frankenwork" merging is happening 2012-02-21 16:54:12 +00:00			`from collections import OrderedDict, defaultdict, namedtuple`
			`from itertools import izip, islice, repeat`
Comment on the "tests" 2012-01-28 01:06:10 +00:00			`import django`
Changed the URL so that "all collections" loaded from Librarything, not just "my library" In add_related, make sure edition is not None before trying to add related editions A demo test 2012-01-28 00:16:46 +00:00
A stepping stone towards loading Gutenberg books. Loading Moby Dick seems to be working more or less 2012-02-01 02:09:01 +00:00			`from django.db.models import Q, F`
			`from regluit.core import bookloader`
Programs and data for fighting Frankenworks 2012-02-24 20:06:24 +00:00			`from django.contrib.comments.models import Comment`

A stepping stone towards loading Gutenberg books. Loading Moby Dick seems to be working more or less 2012-02-01 02:09:01 +00:00			`import warnings`
			`import datetime`
Programs and data for fighting Frankenworks 2012-02-24 20:06:24 +00:00			`from regluit import experimental`
At this point, I have logic in regluit.test.bookloader.load_gutenberg_books to read the data from regluit/experimental/gutenberg/g_seed_isbn.json and load books into the db. Still shaking out bugs from the process though. 2012-02-15 02:01:13 +00:00			`from regluit.experimental import bookdata`
			`from datetime import datetime`
			`import json`

			`import logging`
			`logger = logging.getLogger(__name__)`
A stepping stone towards loading Gutenberg books. Loading Moby Dick seems to be working more or less 2012-02-01 02:09:01 +00:00
Now I think I'm able to calculate the timedate of when the latest "frankenwork" merging is happening 2012-02-21 16:54:12 +00:00			`def dictset(itertuple):`
			`s = defaultdict(set)`
			`for (k, v) in itertuple:`
			`s[k].add(v)`
			`return s`

			`def dictlist(itertuple):`
			`d = defaultdict(list)`
			`for (k, v) in itertuple:`
			`d[k].append(v)`
			`return d`

			`EdInfo = namedtuple('EdInfo', ['isbn', 'ed_id', 'ed_title', 'ed_created', 'work_id', 'work_created', 'lang'])`

Changed the URL so that "all collections" loaded from Librarything, not just "my library" In add_related, make sure edition is not None before trying to add related editions A demo test 2012-01-28 00:16:46 +00:00			`def ry_lt_books():`
Comment on the "tests" 2012-01-28 01:06:10 +00:00			`"""return parsing of rdhyee's LibraryThing collection"""`
Changed the URL so that "all collections" loaded from Librarything, not just "my library" In add_related, make sure edition is not None before trying to add related editions A demo test 2012-01-28 00:16:46 +00:00			`lt = librarything.LibraryThing('rdhyee')`
			`books = lt.parse_user_catalog(view_style=5)`
			`return books`

			`def editions_for_lt(books):`
Comment on the "tests" 2012-01-28 01:06:10 +00:00			`"""return the Editions that correspond to the list of LibraryThing books"""`
Changed the URL so that "all collections" loaded from Librarything, not just "my library" In add_related, make sure edition is not None before trying to add related editions A demo test 2012-01-28 00:16:46 +00:00			`editions = [bookloader.add_by_isbn(b["isbn"]) for b in books]`
			`return editions`

Comment on the "tests" 2012-01-28 01:06:10 +00:00			`def ry_lt_not_loaded():`
			`"""Calculate which of the books on rdhyee's librarything list don't yield Editions"""`
Changed the URL so that "all collections" loaded from Librarything, not just "my library" In add_related, make sure edition is not None before trying to add related editions A demo test 2012-01-28 00:16:46 +00:00			`books = list(ry_lt_books())`
			`editions = editions_for_lt(books)`
At this point, I have logic in regluit.test.bookloader.load_gutenberg_books to read the data from regluit/experimental/gutenberg/g_seed_isbn.json and load books into the db. Still shaking out bugs from the process though. 2012-02-15 02:01:13 +00:00			`not_loaded_books = [b for (b, ed) in izip(books, editions) if ed is None]`
Comment on the "tests" 2012-01-28 01:06:10 +00:00			`return not_loaded_books`

			`def ry_wish_list_equal_loadable_lt_books():`
			`"""returnwhether the set of works in the user's wishlist is the same as the works in a user's loadable editions from LT"""`
			`editions = editions_for_lt(ry_lt_books())`
			`# assume only one user -- and that we have run a LT book loading process for that user`
			`ry = django.contrib.auth.models.User.objects.all()[0]`
A stepping stone towards loading Gutenberg books. Loading Moby Dick seems to be working more or less 2012-02-01 02:09:01 +00:00			`return set([ed.work for ed in filter(None, editions)]) == set(ry.wishlist.works.all())`

			`def clear_works_editions_ebooks():`
			`models.Ebook.objects.all().delete()`
			`models.Work.objects.all().delete()`
			`models.Edition.objects.all().delete()`

Getting close to implementing Gutenberg loading for at least one book Moby Dick 2012-02-02 01:09:04 +00:00
			`def load_penguin_moby_dick():`
			`seed_isbn = '9780142000083'`
			`ed = bookloader.add_by_isbn(seed_isbn)`
			`if ed.new:`
changed core.tasks to not use models 2012-02-16 18:19:36 +00:00			`ed = tasks.populate_edition.delay(ed.isbn_13)`
A stepping stone towards loading Gutenberg books. Loading Moby Dick seems to be working more or less 2012-02-01 02:09:01 +00:00
At this point, I have logic in regluit.test.bookloader.load_gutenberg_books to read the data from regluit/experimental/gutenberg/g_seed_isbn.json and load books into the db. Still shaking out bugs from the process though. 2012-02-15 02:01:13 +00:00			`def load_gutenberg_moby_dick():`
A stepping stone towards loading Gutenberg books. Loading Moby Dick seems to be working more or less 2012-02-01 02:09:01 +00:00			`title = "Moby Dick"`
			`ol_work_id = "/works/OL102749W"`
			`gutenberg_etext_id = 2701`
			`epub_url = "http://www.gutenberg.org/cache/epub/2701/pg2701.epub"`
			`license = 'http://www.gutenberg.org/license'`
			`lang = 'en'`
			`format = 'epub'`
At this point, I have logic in regluit.test.bookloader.load_gutenberg_books to read the data from regluit/experimental/gutenberg/g_seed_isbn.json and load books into the db. Still shaking out bugs from the process though. 2012-02-15 02:01:13 +00:00			`publication_date = datetime(2001,7,1)`
Getting close to implementing Gutenberg loading for at least one book Moby Dick 2012-02-02 01:09:04 +00:00			`seed_isbn = '9780142000083' # http://www.amazon.com/Moby-Dick-Whale-Penguin-Classics-Deluxe/dp/0142000086`
A stepping stone towards loading Gutenberg books. Loading Moby Dick seems to be working more or less 2012-02-01 02:09:01 +00:00
At this point, I have logic in regluit.test.bookloader.load_gutenberg_books to read the data from regluit/experimental/gutenberg/g_seed_isbn.json and load books into the db. Still shaking out bugs from the process though. 2012-02-15 02:01:13 +00:00			`ebook = bookloader.load_gutenberg_edition(title, gutenberg_etext_id, ol_work_id, seed_isbn,`
			`epub_url, format, license, lang, publication_date)`
Getting close to implementing Gutenberg loading for at least one book Moby Dick 2012-02-02 01:09:04 +00:00			`return ebook`
At this point, I have logic in regluit.test.bookloader.load_gutenberg_books to read the data from regluit/experimental/gutenberg/g_seed_isbn.json and load books into the db. Still shaking out bugs from the process though. 2012-02-15 02:01:13 +00:00
Add g_seed_isbn.json which hold the Gutenberg editions I'm loading. 2012-02-27 21:19:58 +00:00			`def load_gutenberg_books(fname="{0}/gutenberg/g_seed_isbn.json".format(experimental.__path__[0]),`
At this point, I have logic in regluit.test.bookloader.load_gutenberg_books to read the data from regluit/experimental/gutenberg/g_seed_isbn.json and load books into the db. Still shaking out bugs from the process though. 2012-02-15 02:01:13 +00:00			`max_num=None):`

			`headers = ()`
			`f = open(fname)`
			`records = json.load(f)`
			`f.close()`

			`for (i, record) in enumerate(islice(records,max_num)):`
			`if record['format'] == 'application/epub+zip':`
			`record['format'] = 'epub'`
			`elif record['format'] == 'application/pdf':`
			`record['format'] = 'pdf'`
Don't load gutenberg book if seed_isbn is None 2012-02-15 17:09:24 +00:00			`if record['seed_isbn'] is not None:`
			`ebook = bookloader.load_gutenberg_edition(**record)`
			`logger.info("%d loaded ebook %s %s", i, ebook, record)`
			`else:`
			`logger.info("%d null seed_isbn: ebook %s", i, ebook)`
At this point, I have logic in regluit.test.bookloader.load_gutenberg_books to read the data from regluit/experimental/gutenberg/g_seed_isbn.json and load books into the db. Still shaking out bugs from the process though. 2012-02-15 02:01:13 +00:00
Now I have booktests to recalculate clusters 2012-02-17 18:30:09 +00:00			`def cluster_status(max_num=None):`
At this point, I have logic in regluit.test.bookloader.load_gutenberg_books to read the data from regluit/experimental/gutenberg/g_seed_isbn.json and load books into the db. Still shaking out bugs from the process though. 2012-02-15 02:01:13 +00:00			`"""Look at the current Work, Edition instances to figure out what needs to be fixed"""`
			`results = OrderedDict([`
			`('number of Works', models.Work.objects.count()),`
Now I have booktests to recalculate clusters 2012-02-17 18:30:09 +00:00			`('number of Works w/o Identifier', models.Work.objects.filter(identifiers__isnull=True).count()),`
			`('number of Editions', models.Edition.objects.count()),`
			`('number of Editions with ISBN', models.Edition.objects.filter(identifiers__type='isbn').count()),`
			`('number of Editions without ISBNs', models.Edition.objects.exclude(identifiers__type='isbn').count()),`
At this point, I have logic in regluit.test.bookloader.load_gutenberg_books to read the data from regluit/experimental/gutenberg/g_seed_isbn.json and load books into the db. Still shaking out bugs from the process though. 2012-02-15 02:01:13 +00:00			`('number of Edition that have both Google Books id and ISBNs',`
			`models.Edition.objects.filter(identifiers__type='isbn').filter(identifiers__type='goog').count()),`
			`('number of Editions with Google Books IDs but not ISBNs',`
			`models.Edition.objects.filter(identifiers__type='goog').exclude(identifiers__type='isbn').count()),`
			`])`

Now I have booktests to recalculate clusters 2012-02-17 18:30:09 +00:00			`# models.Identifier.objects.filter(type='isbn').values_list('value', 'edition__id', 'edition__work__id', 'edition__work__language').count()`
			`# 4 classes -- Edition have ISBN or not & ISBN is recognized or not by LT`
			`# a) ISBN recognized by LT, b) ISBN not recognized by LT, c) no ISBN at all`

			`# [w._meta.get_all_related_objects() for w in works_no_ids] -- try to figure out whether any related objects before deleting`
Check current progress in so that I can focus on a change in the master branch to add missing isbns to Editions 2012-02-16 00:06:40 +00:00
			`# Are there Edition without ISBNs? Look up the corresponding ISBNs from Google Books and Are they all singletons?`

			`# identify Editions that should be merged (e.g., if one Edition has a Google Books ID and another Edition has one with`
			`# an ISBN tied to that Google Books ID)`
Now I have booktests to recalculate clusters 2012-02-17 18:30:09 +00:00

			`import shutil`
			`import time`
Now I think I'm able to calculate the timedate of when the latest "frankenwork" merging is happening 2012-02-21 16:54:12 +00:00			`import operator`
Programs and data for fighting Frankenworks 2012-02-24 20:06:24 +00:00
Now I have booktests to recalculate clusters 2012-02-17 18:30:09 +00:00
			`# let's form a key to map all the Editions into`
			`# (lt_work_id (or None), lang, ISBN (if lt_work_id is None or None if we don't know it), ed_id (or None) )`

			`work_clusters = defaultdict(set)`
			`current_map = defaultdict(set)`

Programs and data for fighting Frankenworks 2012-02-24 20:06:24 +00:00			`#backup = '/Users/raymondyee/D/Document/Gluejar/Gluejar.github/regluit/experimental/lt_data_back.json'`
			`backup = '{0}/lt_data_back.json'.format(experimental.__path__[0])`
			`#fname = '/Users/raymondyee/D/Document/Gluejar/Gluejar.github/regluit/experimental/lt_data.json'`
			`fname = '{0}/lt_data.json'.format(experimental.__path__[0])`
Now I have booktests to recalculate clusters 2012-02-17 18:30:09 +00:00
			`shutil.copy(fname, backup)`

			`lt = bookdata.LibraryThing(fname)`

			`try:`
			`input_file = open(fname, "r")`
			`success = lt.load()`
			`print "success: %s" % (success)`
			`input_file.close()`
			`except Exception, e:`
			`print e`
At this point, I have logic in regluit.test.bookloader.load_gutenberg_books to read the data from regluit/experimental/gutenberg/g_seed_isbn.json and load books into the db. Still shaking out bugs from the process though. 2012-02-15 02:01:13 +00:00
Now I think I'm able to calculate the timedate of when the latest "frankenwork" merging is happening 2012-02-21 16:54:12 +00:00			`for (i, (isbn, ed_id, ed_title, ed_created, work_id, work_created, lang)) in enumerate(`
Now I have booktests to recalculate clusters 2012-02-17 18:30:09 +00:00			`islice(models.Identifier.objects.filter(type='isbn').values_list('value', 'edition__id',`
Now I think I'm able to calculate the timedate of when the latest "frankenwork" merging is happening 2012-02-21 16:54:12 +00:00			`'edition__title', 'edition__created', 'edition__work__id',`
			`'edition__work__created', 'edition__work__language'), max_num)):`
Now I have booktests to recalculate clusters 2012-02-17 18:30:09 +00:00
			`lt_work_id = lt.thingisbn(isbn, return_work_id=True)`
			`key = (lt_work_id, lang, isbn if lt_work_id is None else None, None)`
			`print i, isbn, lt_work_id, key`
Now I think I'm able to calculate the timedate of when the latest "frankenwork" merging is happening 2012-02-21 16:54:12 +00:00			`work_clusters[key].add(EdInfo(isbn=isbn, ed_id=ed_id, ed_title=ed_title, ed_created=ed_created,`
			`work_id=work_id, work_created=work_created, lang=lang))`
Now I have booktests to recalculate clusters 2012-02-17 18:30:09 +00:00			`current_map[work_id].add(key)`

			`lt.save()`

			`# Now add the Editions without any ISBNs`
			`print "editions w/o isbn"`
Now I think I'm able to calculate the timedate of when the latest "frankenwork" merging is happening 2012-02-21 16:54:12 +00:00			`for (i, (ed_id, ed_title, ed_created, work_id, work_created, lang)) in enumerate(`
Now I have booktests to recalculate clusters 2012-02-17 18:30:09 +00:00			`islice(models.Edition.objects.exclude(identifiers__type='isbn').values_list('id',`
Now I think I'm able to calculate the timedate of when the latest "frankenwork" merging is happening 2012-02-21 16:54:12 +00:00			`'title', 'created', 'work__id', 'work__created', 'work__language' ), None)):`
Now I have booktests to recalculate clusters 2012-02-17 18:30:09 +00:00
			`key = (None, lang, None, ed_id)`
Need to fix print statement to emit only ascii 2012-02-24 22:08:14 +00:00			`print i, ed_id, ed_title.encode('ascii','ignore'), key`
Now I think I'm able to calculate the timedate of when the latest "frankenwork" merging is happening 2012-02-21 16:54:12 +00:00			`work_clusters[key].add(EdInfo(isbn=None, ed_id=ed_id, ed_title=ed_title, ed_created=ed_created,`
			`work_id=work_id, work_created=work_created, lang=lang))`
Now I have booktests to recalculate clusters 2012-02-17 18:30:09 +00:00			`current_map[work_id].add(key)`

			`print "number of clusters", len(work_clusters)`

Now I think I'm able to calculate the timedate of when the latest "frankenwork" merging is happening 2012-02-21 16:54:12 +00:00			`# all unglue.it Works that contain Editions belonging to more than one newly calculated cluster are "FrankenWorks"`
			`franken_works = sorted([k for (k,v) in current_map.items() if len(v) > 1])`

Programs and data for fighting Frankenworks 2012-02-24 20:06:24 +00:00			`# let's calculate the list of users affected if delete the Frankenworks, the number of works deleted from their wishlist`
			`# specifically a list of emails to send out`

			`affected_works = [models.Work.objects.get(id=w_id) for w_id in franken_works]`
			`affected_wishlists = set(reduce(operator.add, [list(w.wishlists.all()) for w in affected_works])) if len(affected_works) else set()`

			`affected_emails = [w.user.email for w in affected_wishlists]`
			`affected_editions = reduce(operator.add, [list(w.editions.all()) for w in affected_works]) if len(affected_works) else []`

			`# calculate the Comments that would have to be deleted too.`
			`affected_comments = reduce(operator.add, [list(Comment.objects.for_model(w)) for w in affected_works]) if len(affected_works) else []`

Now I think I'm able to calculate the timedate of when the latest "frankenwork" merging is happening 2012-02-21 16:54:12 +00:00			`# calculate the inverse of work_clusters`
			`wcp = dict(reduce(operator.add, [ list( izip([ed.ed_id for ed in eds], repeat(k))) for (k,eds) in work_clusters.items()]))`

			`# (I'm not completely sure of this calc -- but the datetime of the latest franken-event)`
			`latest_franken_event = max([ max([min(map(lambda x: x[1], v)) for v in dictlist([(wcp[ed["id"]], (ed["id"], ed["created"].isoformat()))`
			`for ed in models.Work.objects.get(id=w_id).editions.values('id', 'created')]).values()])`
Programs and data for fighting Frankenworks 2012-02-24 20:06:24 +00:00			`for w_id in franken_works]) if len(franken_works) else None`
Now I think I'm able to calculate the timedate of when the latest "frankenwork" merging is happening 2012-02-21 16:54:12 +00:00
Programs and data for fighting Frankenworks 2012-02-24 20:06:24 +00:00			`scattered_clusters = [(k, len(set(([e.work_id for e in v])))) for (k,v) in work_clusters.items() if len(set(([e.work_id for e in v]))) <> 1 ]`
Now I have booktests to recalculate clusters 2012-02-17 18:30:09 +00:00
Programs and data for fighting Frankenworks 2012-02-24 20:06:24 +00:00			`s = {'work_clusters':work_clusters, 'current_map':current_map, 'results':results, 'franken_works': franken_works,`
			`'wcp':wcp, 'latest_franken_event': latest_franken_event, 'affected_works':affected_works,`
			`'affected_comments': affected_comments, 'scattered_clusters': scattered_clusters,`
			`'affected_emails': affected_emails}`
Now I have booktests to recalculate clusters 2012-02-17 18:30:09 +00:00
			`return s`

Programs and data for fighting Frankenworks 2012-02-24 20:06:24 +00:00			`def clean_frankenworks(s, do=False):`
			`# list out the email addresses of accounts with wishlists to be affected`

			`print "number of email addresses: ", len(s['affected_emails'])`
			`print ", ".join(s['affected_emails'])`

			`# list the works we delete`
			`print "number of FrankenWorks", len(s['franken_works'])`
			`print s['franken_works']`

			`# delete the affected comments`
			`print "deleting comments"`
			`for (i, comment) in enumerate(s['affected_comments']):`
			`print i, "deleting ", comment`
			`if do:`
			`comment.delete()`

			`# delete the Frankenworks`
			`print "deleting Frankenworks"`
			`for (i, work) in enumerate(s['affected_works']):`
Need to fix print statement to emit only ascii 2012-02-24 22:08:14 +00:00			`print i, "deleting ", work.id`
Programs and data for fighting Frankenworks 2012-02-24 20:06:24 +00:00			`if do:`
			`work.delete()`

			`# run reclustering surgically -- calculate a set of ISBNs to feed to bookloader.add_related`

			`# assuming x is a set`
			`popisbn = lambda x: list(x)[0].isbn if len(x) else None`

			`# group scattered_clusters by LT work id`
			`scattered_lt = dictlist([(k[0], k) for (k,v) in s['scattered_clusters']])`
			`isbns = map(popisbn, [s['work_clusters'][k[0]] for k in scattered_lt.values()])`

			`print "running bookloader"`
			`for (i, isbn) in enumerate(isbns):`
			`print i, isbn`
			`if do:`
			`bookloader.add_related(isbn)`



At this point, I have logic in regluit.test.bookloader.load_gutenberg_books to read the data from regluit/experimental/gutenberg/g_seed_isbn.json and load books into the db. Still shaking out bugs from the process though. 2012-02-15 02:01:13 +00:00
Programs and data for fighting Frankenworks 2012-02-24 20:06:24 +00:00
Move add_missing_isbn_to_editions to bookloader.py 2012-02-16 03:36:18 +00:00
At this point, I have logic in regluit.test.bookloader.load_gutenberg_books to read the data from regluit/experimental/gutenberg/g_seed_isbn.json and load books into the db. Still shaking out bugs from the process though. 2012-02-15 02:01:13 +00:00