Merge remote-tracking branch 'Gluejar/master' into production

2018-02-22 12:04:30 -05:00 · 2018-02-22 12:04:30 -05:00 · d7f8c26882
parent a6def2171d 6a2d1a074b
commit d7f8c26882
2 changed files with 119 additions and 79 deletions
--- a/core/librarything.py
+++ b/core/librarything.py
@ -2,11 +2,12 @@ import csv
 import HTMLParser
 import httplib
 import logging
 import mechanize
 import re
 from datetime import datetime
 import mechanize
 import requests
 from datetime import datetime
 from regluit.core import models
 logger = logging.getLogger(__name__)
@ -40,43 +41,59 @@ class LibraryThing(object):
    def parse_csv(self):
        h = HTMLParser.HTMLParser()
        reader = csv.DictReader(self.csv_handle)
-        # There are more fields to be parsed out.  Note that there is a second author column to handle
+        # There are more fields to be parsed out.  Note that there is a
-        for (i,row) in enumerate(reader):
+        # second author column to handle
        for (i, row) in enumerate(reader):
            # ISBNs are written like '[123456789x]' in the CSV, suggesting possibility of a list
            m = re.match(r'^\[(.*)\]$', row["'ISBNs'"])
            if m:
                isbn = m.group(1).split()
            else:
                isbn = []
-            yield {'title':h.unescape(row["'TITLE'"]), 'author':h.unescape(row["'AUTHOR (first, last)'"]),
+            yield {
-                   'isbn':isbn, 'comment':row["'COMMENT'"],
+                'title':h.unescape(row["'TITLE'"]),
-                   'tags':row["'TAGS'"], 'collections':row["'COLLECTIONS'"],
+                'author':h.unescape(row["'AUTHOR (first, last)'"]),
-                    'reviews':h.unescape(row["'REVIEWS'"])}
+                'isbn':isbn,
                'comment':row["'COMMENT'"],
                'tags':row["'TAGS'"],
                'collections':row["'COLLECTIONS'"],
                'reviews':h.unescape(row["'REVIEWS'"])
            }
    def viewstyle_1(self, rows):
-        for (i,row) in enumerate(rows):
+        for (i, row) in enumerate(rows):
            book_data = {}
            cols = row.xpath('td')
            # cover
-            book_data["cover"] = {"cover_id":cols[0].attrib["id"],
+            book_data["cover"] = {
-                                  "image": {"width":cols[0].xpath('.//img')[0].attrib['width'],
+                "cover_id":cols[0].attrib["id"],
-                                    "src": cols[0].xpath('.//img')[0].attrib['src']}
+                "image": {
                    "width":cols[0].xpath('.//img')[0].attrib['width'],
                    "src": cols[0].xpath('.//img')[0].attrib['src']
                }
            }
            # title
-            book_data["title"] = {"href":cols[1].xpath('.//a')[0].attrib['href'],
+            book_data["title"] = {
-                                  "title":cols[1].xpath('.//a')[0].text}
+                "href":cols[1].xpath('.//a')[0].attrib['href'],
                "title":cols[1].xpath('.//a')[0].text
            }
            # extract work_id and book_id from href
            try:
-                (book_data["work_id"], book_data["book_id"]) = re.match("^/work/(.*)/book/(.*)$",book_data["title"]["href"]).groups()
+                (book_data["work_id"], book_data["book_id"]) = re.match(
                    "^/work/(.*)/book/(.*)$",
                    book_data["title"]["href"]
                ).groups()
            except:
                (book_data["work_id"], book_data["book_id"]) = (None, None)
            # author -- what if there is more than 1?  or none?
            try:
-                book_data["author"] = {"display_name":cols[2].xpath('.//a')[0].text,
+                book_data["author"] = {
-                                       "href":cols[2].xpath('.//a')[0].attrib['href'],
+                    "display_name":cols[2].xpath('.//a')[0].text,
-                                       "name":cols[2].xpath('div')[0].text}
+                    "href":cols[2].xpath('.//a')[0].attrib['href'],
                    "name":cols[2].xpath('div')[0].text
                }
            except:
                book_data["author"] = None
@ -91,13 +108,15 @@ class LibraryThing(object):
            book_data["rating"] = len(cols[5].xpath('.//img[@alt="*"]'))
            # entry date
-            book_data["entry_date"] = datetime.date(datetime.strptime(cols[6].xpath('span')[0].text, "%b %d, %Y"))
+            book_data["entry_date"] = datetime.date(
                datetime.strptime(cols[6].xpath('span')[0].text, "%b %d, %Y")
            )
            yield book_data
    def viewstyle_5(self, rows):
        # implement this view to get at the ISBNs
-        for (i,row) in enumerate(rows):
+        for (i, row) in enumerate(rows):
            book_data = {}
            cols = row.xpath('td')
@ -107,7 +126,10 @@ class LibraryThing(object):
            # extract work_id and book_id from href
            try:
-                (book_data["work_id"], book_data["book_id"]) = re.match("^/work/(.*)/book/(.*)$",book_data["title"]["href"]).groups()
+                (book_data["work_id"], book_data["book_id"]) = re.match(
                    "^/work/(.*)/book/(.*)$",
                    book_data["title"]["href"]
                ).groups()
            except:
                (book_data["work_id"], book_data["book_id"]) = (None, None)
@ -145,12 +167,12 @@ class LibraryThing(object):
        # we can vary viewstyle to get different info
-        IMPLEMENTED_STYLES = [1,5]
+        IMPLEMENTED_STYLES = [1, 5]
        COLLECTION = 2 # set to get All Collections
        if view_style not in IMPLEMENTED_STYLES:
            raise NotImplementedError()
-        style_parser = getattr(self,"viewstyle_%s" % view_style)
+        style_parser = getattr(self, "viewstyle_%s" % view_style)
        next_page = True
        offset = 0
        cookies = None
@ -160,8 +182,9 @@ class LibraryThing(object):
        cookies = r.cookies
        while next_page:
-            url = "https://www.librarything.com/catalog_bottom.php?view=%s&viewstyle=%d&collection=%d&offset=%d" % (self.username,
+            url = "https://www.librarything.com/catalog_bottom.php?view=%s&viewstyle=%d&collection=%d&offset=%d" % (
-                                        view_style, COLLECTION, offset)
+                self.username, view_style, COLLECTION, offset
            )
            logger.info("url: %s", url)
            if cookies is None:
                r = requests.get(url)
@ -169,10 +192,8 @@ class LibraryThing(object):
                r = requests.get(url, cookies=cookies)
            if r.status_code != httplib.OK:
-                raise LibraryThingException("Error accessing %s: %s" % (url, e))
+                raise LibraryThingException("Error accessing %s: status %s" % (url, r.status_code))
                logger.info("Error accessing %s: %s", url, e)
            etree = html.fromstring(r.content)
            #logger.info("r.content %s", r.content)
            cookies = r.cookies  # retain the cookies
            # look for a page bar
@ -180,13 +201,16 @@ class LibraryThing(object):
            # 1 - 50 of 82
            try:
                count_text = etree.xpath('//td[@class="pbGroup"]')[0].text
-                total = int(re.search(r'(\d+)$',count_text).group(1))
+                total = int(re.search(r'(\d+)$', count_text).group(1))
                logger.info('total: %d', total)
-            except Exception, e:  # assume for now that if we can't grab this text, there is no page bar and no books
+            except Exception, e:
                # assume for now that if we can't grab this text,
                # there is no page bar and no books
                logger.info('Exception {0}'.format(e))
                total = 0
-            # to do paging we can either look for a next link or just increase the offset by the number of rows.
+            # to do paging we can either look for a next link or just increase the offset
            # by the number of rows.
            # Let's try the latter
            # possible_next_link = etree.xpath('//a[@class="pageShuttleButton"]')[0]
@ -197,10 +221,11 @@ class LibraryThing(object):
            i = -1 # have to account for the problem of style_parser(rows) returning nothing
-            for (i,row) in enumerate(style_parser(rows)):
+            for (i, row) in enumerate(style_parser(rows)):
                yield row
-            # page size = 50, first page offset = 0, second page offset = 50 -- if total = 50 no need to go
+            # page size = 50, first page offset = 0, second page offset = 50
            # -- if total = 50 no need to go on to a second page
            offset += i + 1
            if offset >= total:
@ -208,7 +233,8 @@ class LibraryThing(object):
 def load_librarything_into_wishlist(user, lt_username, max_books=None):
    """
-    Load a specified LibraryThing shelf (by default:  all the books from the LibraryThing account associated with user)
+    Load a specified LibraryThing shelf (by default:  all the books
    from the LibraryThing account associated with user)
    """
    from regluit.core import bookloader
@ -219,7 +245,7 @@ def load_librarything_into_wishlist(user, lt_username, max_books=None):
    lt = LibraryThing(lt_username)
-    for (i,book) in enumerate(islice(lt.parse_user_catalog(view_style=5),max_books)):
+    for (i, book) in enumerate(islice(lt.parse_user_catalog(view_style=5), max_books)):
        isbn = book["isbn"]  # grab the first one
        logger.info("%d %s %s", i, book["title"]["title"], isbn)
        try:
@ -229,13 +255,27 @@ def load_librarything_into_wishlist(user, lt_username, max_books=None):
            if not edition:
                continue
            # add the librarything ids to the db since we know them now
-            identifier= models.Identifier.get_or_add(type = 'thng', value = book['book_id'], edition = edition, work = edition.work)
+            identifier = models.Identifier.get_or_add(
-            identifier= models.Identifier.get_or_add(type = 'ltwk', value = book['work_id'], work = edition.work)
+                type='thng',
                value=book['book_id'],
                edition=edition,
                work=edition.work
            )
            identifier = models.Identifier.get_or_add(
                type='ltwk',
                value=book['work_id'],
                work=edition.work
            )
            if book['lc_call_number']:
-                identifier= models.Identifier.get_or_add(type = 'lccn', value = book['lc_call_number'], edition = edition, work = edition.work)
+                identifier = models.Identifier.get_or_add(
                    type='lccn',
                    value=book['lc_call_number'],
                    edition=edition,
                    work=edition.work
                )
            user.wishlist.add_work(edition.work, 'librarything', notify=True)
            if edition.new:
                tasks.populate_edition.delay(edition.isbn_13)
            logger.info("Work with isbn %s added to wishlist.", isbn)
        except Exception, e:
-            logger.info ("error adding ISBN %s: %s", isbn, e)             
+            logger.info("error adding ISBN %s: %s", isbn, e)
--- a/core/management/commands/load_books_from_sitemap.py
+++ b/core/management/commands/load_books_from_sitemap.py
@ -30,9 +30,9 @@ class Command(BaseCommand):
            books = []
            for sitemap in content:
                added = add_by_sitemap(sitemap.strip(), maxnum=max)
-                max = max - len(added)
+                max = max - len(added) if max else max
                books =  books + added
-                if max < 0:
+                if max and max < 0:
                    break
        else:
            books = add_by_sitemap(url, maxnum=max)