diff --git a/core/librarything.py b/core/librarything.py
index 9572e5f2..ba38f1fe 100644
--- a/core/librarything.py
+++ b/core/librarything.py
@@ -2,11 +2,12 @@ import csv
 import HTMLParser
 import httplib
 import logging
-import mechanize
 import re
+from datetime import datetime
+
+import mechanize
 import requests
-from datetime import datetime
 
 from regluit.core import models
 
 logger = logging.getLogger(__name__)
@@ -20,7 +21,7 @@ class LibraryThing(object):
     """
     url = "https://www.librarything.com"
    csv_file_url = "https://www.librarything.com/export-csv"
-    
+
     def __init__(self, username=None, password=None):
         self.username = username
         self.password = password
@@ -40,77 +41,98 @@ class LibraryThing(object):
     def parse_csv(self):
         h = HTMLParser.HTMLParser()
         reader = csv.DictReader(self.csv_handle)
-        # There are more fields to be parsed out. Note that there is a second author column to handle
-        for (i,row) in enumerate(reader):
+        # There are more fields to be parsed out. Note that there is a
+        # second author column to handle
+        for (i, row) in enumerate(reader):
             # ISBNs are written like '[123456789x]' in the CSV, suggesting possibility of a list
             m = re.match(r'^\[(.*)\]$', row["'ISBNs'"])
             if m:
                 isbn = m.group(1).split()
             else:
                 isbn = []
-            yield {'title':h.unescape(row["'TITLE'"]), 'author':h.unescape(row["'AUTHOR (first, last)'"]),
-                   'isbn':isbn, 'comment':row["'COMMENT'"],
-                   'tags':row["'TAGS'"], 'collections':row["'COLLECTIONS'"],
-                   'reviews':h.unescape(row["'REVIEWS'"])}
+            yield {
+                'title': h.unescape(row["'TITLE'"]),
+                'author': h.unescape(row["'AUTHOR (first, last)'"]),
+                'isbn': isbn,
+                'comment': row["'COMMENT'"],
+                'tags': row["'TAGS'"],
+                'collections': row["'COLLECTIONS'"],
+                'reviews': h.unescape(row["'REVIEWS'"])
+            }
 
     def viewstyle_1(self, rows):
-        
-        for (i,row) in enumerate(rows):
+
+        for (i, row) in enumerate(rows):
             book_data = {}
             cols = row.xpath('td')
             # cover
-            book_data["cover"] = {"cover_id":cols[0].attrib["id"],
-                                  "image": {"width":cols[0].xpath('.//img')[0].attrib['width'],
-                                            "src": cols[0].xpath('.//img')[0].attrib['src']}
+            book_data["cover"] = {
+                "cover_id": cols[0].attrib["id"],
+                "image": {
+                    "width": cols[0].xpath('.//img')[0].attrib['width'],
+                    "src": cols[0].xpath('.//img')[0].attrib['src']
+                }
             }
             # title
-            book_data["title"] = {"href":cols[1].xpath('.//a')[0].attrib['href'],
-                                  "title":cols[1].xpath('.//a')[0].text}
-            
+            book_data["title"] = {
+                "href": cols[1].xpath('.//a')[0].attrib['href'],
+                "title": cols[1].xpath('.//a')[0].text
+            }
+
             # extract work_id and book_id from href
             try:
-                (book_data["work_id"], book_data["book_id"]) = re.match("^/work/(.*)/book/(.*)$",book_data["title"]["href"]).groups()
+                (book_data["work_id"], book_data["book_id"]) = re.match(
+                    "^/work/(.*)/book/(.*)$",
+                    book_data["title"]["href"]
+                ).groups()
             except:
                 (book_data["work_id"], book_data["book_id"]) = (None, None)
-            
+
             # author -- what if there is more than 1? or none?
             try:
-                book_data["author"] = {"display_name":cols[2].xpath('.//a')[0].text,
-                                       "href":cols[2].xpath('.//a')[0].attrib['href'],
-                                       "name":cols[2].xpath('div')[0].text}
+                book_data["author"] = {
+                    "display_name": cols[2].xpath('.//a')[0].text,
+                    "href": cols[2].xpath('.//a')[0].attrib['href'],
+                    "name": cols[2].xpath('div')[0].text
+                }
             except:
                 book_data["author"] = None
-            
+
             # date
             book_data["date"] = cols[3].xpath('span')[0].text
-            
+
             # tags: grab tags that are not empty strings
             tag_links = cols[4].xpath('.//a')
             book_data["tags"] = filter(lambda x: x is not None, [a.text for a in tag_links])
-            
+
             # rating -- count # of stars
             book_data["rating"] = len(cols[5].xpath('.//img[@alt="*"]'))
-            
+
             # entry date
-            book_data["entry_date"] = datetime.date(datetime.strptime(cols[6].xpath('span')[0].text, "%b %d, %Y"))
-            
+            book_data["entry_date"] = datetime.date(
+                datetime.strptime(cols[6].xpath('span')[0].text, "%b %d, %Y")
+            )
+
             yield book_data
-    
+
     def viewstyle_5(self, rows):
         # implement this view to get at the ISBNs
-        for (i,row) in enumerate(rows):
+        for (i, row) in enumerate(rows):
             book_data = {}
             cols = row.xpath('td')
-            
+
             # title
             book_data["title"] = {"href":cols[0].xpath('.//a')[0].attrib['href'],
                                   "title":cols[0].xpath('.//a')[0].text}
-            
+
             # extract work_id and book_id from href
             try:
-                (book_data["work_id"], book_data["book_id"]) = re.match("^/work/(.*)/book/(.*)$",book_data["title"]["href"]).groups()
+                (book_data["work_id"], book_data["book_id"]) = re.match(
+                    "^/work/(.*)/book/(.*)$",
+                    book_data["title"]["href"]
+                ).groups()
             except:
                 (book_data["work_id"], book_data["book_id"]) = (None, None)
-            
+
             # tags
             tag_links = cols[1].xpath('.//a')
             book_data["tags"] = filter(lambda x: x is not None, [a.text for a in tag_links])
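
For reference, a minimal sketch of the CSV shape that parse_csv above expects: LibraryThing quotes its own header names inside the export, so the DictReader keys carry embedded single quotes (row["'TITLE'"]), and the ISBNs cell holds a bracketed, space-separated list. The sample row is invented for illustration.

    import csv
    import re
    from StringIO import StringIO

    # invented sample in the export's format: quoted headers, bracketed ISBN list
    sample = StringIO(
        '"\'TITLE\'","\'AUTHOR (first, last)\'","\'ISBNs\'"\n'
        '"Example Title","Author, Example","[0441012035 9780441012039]"\n'
    )
    for row in csv.DictReader(sample):
        m = re.match(r'^\[(.*)\]$', row["'ISBNs'"])
        isbn = m.group(1).split() if m else []
        print row["'TITLE'"], isbn  # Example Title ['0441012035', '9780441012039']
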
@@ -121,13 +143,13 @@ class LibraryThing(object):
             except Exception, e:
                 logger.info("no lc call number for: %s %s", book_data["title"], e)
                 book_data["lc_call_number"] = None
-            
+
             # subject
-            
+
             subjects = cols[3].xpath('.//div[@class="subjectLine"]')
             book_data["subjects"] = [{'href':s.xpath('a')[0].attrib['href'],
                                       'text':s.xpath('a')[0].text} for s in subjects]
-            
+
             # isbn
             try:
                 book_data["isbn"] = cols[4].xpath('.//span')[0].text
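
Both view styles unpack work and book ids from catalog hrefs with the same pattern. Below is a minimal sketch of how that regex behaves, using a match-first guard in place of the bare except; the hrefs are invented.

    import re

    WORK_BOOK_RE = re.compile(r'^/work/(.*)/book/(.*)$')

    for href in ['/work/1151424/book/80381301', '/author/someone']:  # invented
        m = WORK_BOOK_RE.match(href)
        (work_id, book_id) = m.groups() if m else (None, None)
        print href, work_id, book_id  # ids for the first href, None None for the second
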
@@ -136,90 +158,94 @@ class LibraryThing(object):
                 book_data["isbn"] = None
             except Exception, e:
                 book_data["isbn"] = None
-            
+
             yield book_data
-    
+
     def parse_user_catalog(self, view_style=1):
         from lxml import html
-        
+
         # we can vary viewstyle to get different info
-        
-        IMPLEMENTED_STYLES = [1,5]
+
+        IMPLEMENTED_STYLES = [1, 5]
         COLLECTION = 2 # set to get All Collections
-        
+
         if view_style not in IMPLEMENTED_STYLES:
             raise NotImplementedError()
-        style_parser = getattr(self,"viewstyle_%s" % view_style)
+        style_parser = getattr(self, "viewstyle_%s" % view_style)
         next_page = True
         offset = 0
         cookies = None
-        
+
         # go to the front page of LibraryThing first to pick up relevant session-like cookies
         r = requests.get("https://www.librarything.com/")
         cookies = r.cookies
-        
+
         while next_page:
-            url = "https://www.librarything.com/catalog_bottom.php?view=%s&viewstyle=%d&collection=%d&offset=%d" % (self.username,
-                   view_style, COLLECTION, offset)
+            url = "https://www.librarything.com/catalog_bottom.php?view=%s&viewstyle=%d&collection=%d&offset=%d" % (
+                self.username, view_style, COLLECTION, offset
+            )
             logger.info("url: %s", url)
             if cookies is None:
                 r = requests.get(url)
             else:
                 r = requests.get(url, cookies=cookies)
-            
+
             if r.status_code != httplib.OK:
-                raise LibraryThingException("Error accessing %s: %s" % (url, e))
-                logger.info("Error accessing %s: %s", url, e)
+                raise LibraryThingException("Error accessing %s: status %s" % (url, r.status_code))
             etree = html.fromstring(r.content)
-            #logger.info("r.content %s", r.content)
             cookies = r.cookies # retain the cookies
-            
+
             # look for a page bar
             # try to grab the total number of books
             # 1 - 50 of 82
             try:
                 count_text = etree.xpath('//td[@class="pbGroup"]')[0].text
-                total = int(re.search(r'(\d+)$',count_text).group(1))
+                total = int(re.search(r'(\d+)$', count_text).group(1))
                 logger.info('total: %d', total)
-            except Exception, e: # assume for now that if we can't grab this text, there is no page bar and no books
+            except Exception, e:
+                # assume for now that if we can't grab this text,
+                # there is no page bar and no books
                 logger.info('Exception {0}'.format(e))
                 total = 0
-            
-            # to do paging we can either look for a next link or just increase the offset by the number of rows.
+
+            # to do paging we can either look for a next link or just increase the offset
+            # by the number of rows.
             # Let's try the latter
             # possible_next_link = etree.xpath('//a[@class="pageShuttleButton"]')[0]
-            
+
             rows_xpath = '//table[@id="lt_catalog_list"]/tbody/tr'
-            
+
             # deal with page 1 first and then working on paging through the collection
             rows = etree.xpath(rows_xpath)
-            
-            i = -1 # have to account for the problem of style_parser(rows) returning nothing
-            
-            for (i,row) in enumerate(style_parser(rows)):
-                yield row
-                
-            # page size = 50, first page offset = 0, second page offset = 50 -- if total = 50 no need to go
-            offset += i + 1
+            i = -1 # have to account for the problem of style_parser(rows) returning nothing
+
+            for (i, row) in enumerate(style_parser(rows)):
+                yield row
+
+            # page size = 50, first page offset = 0, second page offset = 50
+            # -- if total = 50 there is no need to fetch another page
+
+            offset += i + 1
             if offset >= total:
                 next_page = False
 
 def load_librarything_into_wishlist(user, lt_username, max_books=None):
     """
-    Load a specified LibraryThing shelf (by default: all the books from the LibraryThing account associated with user)
+    Load a specified LibraryThing shelf (by default: all the books
+    from the LibraryThing account associated with user)
     """
-    
+
     from regluit.core import bookloader
     from regluit.core import tasks
     from itertools import islice
-    
+
     logger.info("Entering into load_librarything_into_wishlist")
     lt = LibraryThing(lt_username)
-    
-    
-    for (i,book) in enumerate(islice(lt.parse_user_catalog(view_style=5),max_books)):
+
+
+    for (i, book) in enumerate(islice(lt.parse_user_catalog(view_style=5), max_books)):
         isbn = book["isbn"] # grab the first one
         logger.info("%d %s %s", i, book["title"]["title"], isbn)
         try:
@@ -229,13 +255,27 @@ def load_librarything_into_wishlist(user, lt_username, max_books=None):
             if not edition:
                 continue
             # add the librarything ids to the db since we know them now
-            identifier= models.Identifier.get_or_add(type = 'thng', value = book['book_id'], edition = edition, work = edition.work)
-            identifier= models.Identifier.get_or_add(type = 'ltwk', value = book['work_id'], work = edition.work)
+            identifier = models.Identifier.get_or_add(
+                type='thng',
+                value=book['book_id'],
+                edition=edition,
+                work=edition.work
+            )
+            identifier = models.Identifier.get_or_add(
+                type='ltwk',
+                value=book['work_id'],
+                work=edition.work
+            )
             if book['lc_call_number']:
-                identifier= models.Identifier.get_or_add(type = 'lccn', value = book['lc_call_number'], edition = edition, work = edition.work)
+                identifier = models.Identifier.get_or_add(
+                    type='lccn',
+                    value=book['lc_call_number'],
+                    edition=edition,
+                    work=edition.work
+                )
             user.wishlist.add_work(edition.work, 'librarything', notify=True)
             if edition.new:
                 tasks.populate_edition.delay(edition.isbn_13)
             logger.info("Work with isbn %s added to wishlist.", isbn)
         except Exception, e:
-            logger.info ("error adding ISBN %s: %s", isbn, e)
+            logger.info("error adding ISBN %s: %s", isbn, e)
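
load_librarything_into_wishlist above leans on an islice property: a stop of None imposes no limit, so max_books=None walks the whole catalog while an integer truncates it. A minimal illustration, with fake_catalog as an invented stand-in for parse_user_catalog.

    from itertools import islice

    def fake_catalog():  # invented stand-in for parse_user_catalog(view_style=5)
        yield {'isbn': '0441012035'}
        yield {'isbn': '9780141439600'}

    print list(islice(fake_catalog(), 1))     # max_books=1 stops after one book
    print list(islice(fake_catalog(), None))  # max_books=None imposes no limit
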
diff --git a/core/management/commands/load_books_from_sitemap.py b/core/management/commands/load_books_from_sitemap.py
index fbccd552..dcc886ad 100644
--- a/core/management/commands/load_books_from_sitemap.py
+++ b/core/management/commands/load_books_from_sitemap.py
@@ -30,9 +30,9 @@ class Command(BaseCommand):
             books = []
             for sitemap in content:
                 added = add_by_sitemap(sitemap.strip(), maxnum=max)
-                max = max - len(added)
+                max = max - len(added) if max else max
                 books = books + added
-                if max < 0:
+                if max and max < 0:
                     break
         else:
             books = add_by_sitemap(url, maxnum=max)
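
The max bookkeeping above leaves a falsy max untouched (None means unlimited), so only a real integer budget is decremented; note that a budget landing exactly on 0 also becomes falsy and stops shrinking. A minimal trace with invented sitemap sizes:

    for budget in (None, 5):
        max = budget  # shadows the builtin, as in the command itself
        for added in (3, 4):  # pretend two sitemaps added 3 then 4 books
            max = max - added if max else max
            if max and max < 0:
                break
        print budget, '->', max  # None -> None, 5 -> -2
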