Check in current progress so that I can focus on a change in the master branch to add missing ISBNs to Editions

pull/1/head
Raymond Yee 2012-02-15 16:06:40 -08:00
parent 8c397f4953
commit a8f1c157be
3 changed files with 34 additions and 12 deletions

View File

@ -6,8 +6,9 @@ from pprint import pprint
from itertools import islice, izip, repeat from itertools import islice, izip, repeat
import logging import logging
from xml.etree import ElementTree from xml.etree import ElementTree
import random
random.seed()
import sys, os import sys, os
@ -44,6 +45,7 @@ RY_OLID = 'OL4264806A'
SURFACING_WORK_OLID = 'OL675829W' SURFACING_WORK_OLID = 'OL675829W'
SURFACING_EDITION_OLID = 'OL8075248M' SURFACING_EDITION_OLID = 'OL8075248M'
SURFACING_ISBN = '9780446311076' SURFACING_ISBN = '9780446311076'
SURFACING_LT_WORK_ID = '18997'
USER_AGENT = "rdhyee@gluejar.com" USER_AGENT = "rdhyee@gluejar.com"
@ -96,12 +98,17 @@ def lt_whatwork(isbn=None, title=None, author=None):
http://www.librarything.com/blogs/thingology/2009/03/new-api-what-work/ http://www.librarything.com/blogs/thingology/2009/03/new-api-what-work/
""" """
logger.info("looking up at lt_whatwork (isbn, title, author): %s %s %s" ,isbn, title, author) logger.info("looking up at lt_whatwork (isbn, title, author): %s %s %s" ,isbn, title, author)
url = "http://www.librarything.com/api/whatwork.php?" url = "http://www.librarything.com/api/whatwork.php"
url = "http://www.librarything.com/api/thingISBN/%s" % isbn params=dict([(k,v) for (k,v) in {'isbn':isbn, 'title':title, 'author':author}.items() if v is not None])
xml = requests.get(url, headers={"User-Agent": USER_AGENT}).content
xml = requests.get(url, params=params, headers={"User-Agent": USER_AGENT}).content
doc = ElementTree.fromstring(xml) doc = ElementTree.fromstring(xml)
return [e.text for e in doc.findall('isbn')]
work = doc.find('work')
if work is not None:
return work.text
else:
return None
def hathi_bib(id, id_type='isbn', detail_level='brief'): def hathi_bib(id, id_type='isbn', detail_level='brief'):
url = "http://catalog.hathitrust.org/api/volumes/brief/%s/%s.json" % (id_type, id) url = "http://catalog.hathitrust.org/api/volumes/brief/%s/%s.json" % (id_type, id)
@ -763,21 +770,30 @@ class GoogleBooksTest(TestCase):
results = gb.volumeid(g_id, glossed=True) results = gb.volumeid(g_id, glossed=True)
print results print results
class LibraryThingTest(TestCase):
class thingISBNTest(TestCase):
def test_lt_isbn(self): def test_lt_isbn(self):
isbns = thingisbn(SURFACING_ISBN) isbns = thingisbn(SURFACING_ISBN)
# convert to isbn-13 # convert to isbn-13
isbns = map(lambda x: isbn_mod.ISBN(x).to_string('13'), isbns) isbns = map(lambda x: isbn_mod.ISBN(x).to_string('13'), isbns)
print isbns self.assertTrue(SURFACING_ISBN in isbns)
# grab a random ISBN from the list, issue another call and then check that the new list is the same
isbns1 = map(lambda x: isbn_mod.ISBN(x).to_string('13'), thingisbn(random.sample(isbns,1)[0]))
self.assertEqual(set(isbns), set(isbns1))
def test_whatwork(self):
work_id = lt_whatwork(isbn=SURFACING_ISBN)
self.assertEqual(work_id, SURFACING_LT_WORK_ID)
work_id = lt_whatwork(title='Hamlet', author='Shakespeare')
self.assertEqual(work_id, '2199')
def suite(): def suite():
#testcases = [WorkMapperTest,FreebaseBooksTest, OpenLibraryTest,GoogleBooksTest] #testcases = [WorkMapperTest,FreebaseBooksTest, OpenLibraryTest,GoogleBooksTest]
testcases = [] testcases = []
suites = unittest.TestSuite([unittest.TestLoader().loadTestsFromTestCase(testcase) for testcase in testcases]) suites = unittest.TestSuite([unittest.TestLoader().loadTestsFromTestCase(testcase) for testcase in testcases])
suites.addTest(GoogleBooksTest('test_volumeid')) suites.addTest(LibraryThingTest('test_whatwork'))
#suites.addTest(SettingsTest('test_dev_me_alignment')) # give option to test this alignment #suites.addTest(SettingsTest('test_dev_me_alignment')) # give option to test this alignment
return suites return suites

View File

@ -90,6 +90,7 @@ def get_or_create(session, model, defaults=None, **kwargs):
Base = declarative_base() Base = declarative_base()
class SeedISBN(Base): class SeedISBN(Base):
__tablename__ = 'SeedISBN' __tablename__ = 'SeedISBN'

View File

@ -89,14 +89,19 @@ def cluster_status():
"""Look at the current Work, Edition instances to figure out what needs to be fixed""" """Look at the current Work, Edition instances to figure out what needs to be fixed"""
results = OrderedDict([ results = OrderedDict([
('number of Works', models.Work.objects.count()), ('number of Works', models.Work.objects.count()),
('number of Editions', models.Edition.objects.count())
('number of Edition that have both Google Books id and ISBNs', ('number of Edition that have both Google Books id and ISBNs',
models.Edition.objects.filter(identifiers__type='isbn').filter(identifiers__type='goog').count()), models.Edition.objects.filter(identifiers__type='isbn').filter(identifiers__type='goog').count()),
('number of Editions with Google Books IDs but not ISBNs', ('number of Editions with Google Books IDs but not ISBNs',
models.Edition.objects.filter(identifiers__type='goog').exclude(identifiers__type='isbn').count()), models.Edition.objects.filter(identifiers__type='goog').exclude(identifiers__type='isbn').count()),
]) ])
# Are there Edition without ISBNs? Are they all singletons? # What needs to be done to recluster editions?
# Are there Editions without ISBNs? Look up the corresponding ISBNs from Google Books. Are they all singletons?
# identify Editions that should be merged (e.g., if one Edition has a Google Books ID and another Edition has one with
# an ISBN tied to that Google Books ID)
return results return results