From a8f1c157bebd3c1eb33eac3283eadcd9f079a8df Mon Sep 17 00:00:00 2001
From: Raymond Yee
Date: Wed, 15 Feb 2012 16:06:40 -0800
Subject: [PATCH] Check in current progress so that I can focus on a change in
 the master branch to add missing ISBNs to Editions

---
 experimental/bookdata.py            | 36 +++++++++++++++++++++--------
 experimental/gutenberg/gutenberg.py |  1 +
 test/booktests.py                   |  9 ++++++--
 3 files changed, 34 insertions(+), 12 deletions(-)

diff --git a/experimental/bookdata.py b/experimental/bookdata.py
index 4cb4549a..75a390e0 100644
--- a/experimental/bookdata.py
+++ b/experimental/bookdata.py
@@ -6,8 +6,9 @@ from pprint import pprint
 from itertools import islice, izip, repeat
 import logging
 from xml.etree import ElementTree
+import random
 
-
+random.seed()
 
 
 import sys, os
@@ -44,6 +45,7 @@ RY_OLID = 'OL4264806A'
 SURFACING_WORK_OLID = 'OL675829W'
 SURFACING_EDITION_OLID = 'OL8075248M'
 SURFACING_ISBN = '9780446311076'
+SURFACING_LT_WORK_ID = '18997'
 
 USER_AGENT = "rdhyee@gluejar.com"
 
@@ -96,12 +98,17 @@ def lt_whatwork(isbn=None, title=None, author=None):
     http://www.librarything.com/blogs/thingology/2009/03/new-api-what-work/
     """
     logger.info("looking up at lt_whatwork (isbn, title, author): %s %s %s" ,isbn, title, author)
-    url = "http://www.librarything.com/api/whatwork.php?"
-    url = "http://www.librarything.com/api/thingISBN/%s" % isbn
-    xml = requests.get(url, headers={"User-Agent": USER_AGENT}).content
+    url = "http://www.librarything.com/api/whatwork.php"
+    params=dict([(k,v) for (k,v) in {'isbn':isbn, 'title':title, 'author':author}.items() if v is not None])
+
+    xml = requests.get(url, params=params, headers={"User-Agent": USER_AGENT}).content
     doc = ElementTree.fromstring(xml)
 
-    return [e.text for e in doc.findall('isbn')]
+    work = doc.find('work')
+    if work is not None:
+        return work.text
+    else:
+        return None
 
 def hathi_bib(id, id_type='isbn', detail_level='brief'):
     url = "http://catalog.hathitrust.org/api/volumes/brief/%s/%s.json" % (id_type, id)
@@ -763,21 +770,30 @@ class GoogleBooksTest(TestCase):
         results = gb.volumeid(g_id, glossed=True)
         print results
 
-
-
-class thingISBNTest(TestCase):
+class LibraryThingTest(TestCase):
     def test_lt_isbn(self):
+        isbns = thingisbn(SURFACING_ISBN)
         # convert to isbn-13
         isbns = map(lambda x: isbn_mod.ISBN(x).to_string('13'), isbns)
-        print isbns
+        self.assertTrue(SURFACING_ISBN in isbns)
+
+        # grab a random ISBN from the list, issue another call and then check that the new list is the same
+        isbns1 = map(lambda x: isbn_mod.ISBN(x).to_string('13'), thingisbn(random.sample(isbns,1)[0]))
+        self.assertEqual(set(isbns), set(isbns1))
+    def test_whatwork(self):
+        work_id = lt_whatwork(isbn=SURFACING_ISBN)
+        self.assertEqual(work_id, SURFACING_LT_WORK_ID)
+        work_id = lt_whatwork(title='Hamlet', author='Shakespeare')
+        self.assertEqual(work_id, '2199')
+
 
 def suite():
     #testcases = [WorkMapperTest,FreebaseBooksTest, OpenLibraryTest,GoogleBooksTest]
     testcases = []
     suites = unittest.TestSuite([unittest.TestLoader().loadTestsFromTestCase(testcase) for testcase in testcases])
-    suites.addTest(GoogleBooksTest('test_volumeid'))
+    suites.addTest(LibraryThingTest('test_whatwork'))
     #suites.addTest(SettingsTest('test_dev_me_alignment')) # give option to test this alignment
     return suites
 
diff --git a/experimental/gutenberg/gutenberg.py b/experimental/gutenberg/gutenberg.py
index 49bb4e50..a645e7fd 100644
--- a/experimental/gutenberg/gutenberg.py
+++ b/experimental/gutenberg/gutenberg.py
@@ -90,6 +90,7 @@ def get_or_create(session, model, defaults=None, **kwargs):
 
 Base = declarative_base()
 
+
 class SeedISBN(Base):
     __tablename__ = 'SeedISBN'
 
diff --git a/test/booktests.py b/test/booktests.py
index f11e1a11..d1e87773 100644
--- a/test/booktests.py
+++ b/test/booktests.py
@@ -89,14 +89,19 @@ def cluster_status():
     """Look at the current Work, Edition instances to figure out what needs to be fixed"""
     results = OrderedDict([
         ('number of Works', models.Work.objects.count()),
+        ('number of Editions', models.Edition.objects.count()),
         ('number of Edition that have both Google Books id and ISBNs',
             models.Edition.objects.filter(identifiers__type='isbn').filter(identifiers__type='goog').count()),
         ('number of Editions with Google Books IDs but not ISBNs',
             models.Edition.objects.filter(identifiers__type='goog').exclude(identifiers__type='isbn').count()),
-
         ])
-    # Are there Edition without ISBNs?  Are they all singletons?
+    # What needs to be done to recluster editions?
+
+    # Are there Editions without ISBNs?  Look up the corresponding ISBNs from Google Books.  Are they all singletons?
+
+    # identify Editions that should be merged (e.g., if one Edition has a Google Books ID and another Edition has one with
+    # an ISBN tied to that Google Books ID)
     return results
 
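
Note (reviewer sketch, not applied by this patch): the reworked lt_whatwork() now returns a single
LibraryThing work id (or None) instead of a list of ISBNs, which is what the new
LibraryThingTest.test_whatwork exercises. Below is a minimal standalone sketch of the same call
flow, mirroring the endpoint, parameters, and XML parsing of the patched function; the expected
ids come from the tests above. It assumes only the requests library and network access, and live
results depend on the LibraryThing service.

    import requests
    from xml.etree import ElementTree

    def whatwork(isbn=None, title=None, author=None):
        # LibraryThing "What work?" API: pass whichever of isbn/title/author are known
        url = "http://www.librarything.com/api/whatwork.php"
        params = dict([(k, v) for (k, v) in
                       {'isbn': isbn, 'title': title, 'author': author}.items()
                       if v is not None])
        xml = requests.get(url, params=params,
                           headers={"User-Agent": "rdhyee@gluejar.com"}).content
        # look for a <work> element at the top level of the response, as the patched function does
        work = ElementTree.fromstring(xml).find('work')
        return work.text if work is not None else None

    # expected per the tests above: '18997' and '2199'
    print whatwork(isbn='9780446311076')
    print whatwork(title='Hamlet', author='Shakespeare')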