pull/91/head
eric 2018-02-22 11:29:52 -05:00
parent 1425b29d49
commit 957cf615b4
1 changed file with 116 additions and 74 deletions


@@ -2,11 +2,12 @@ import csv
 import HTMLParser
 import httplib
 import logging
-import mechanize
 import re
+from datetime import datetime
+import mechanize
 import requests
-from datetime import datetime
 from regluit.core import models
 logger = logging.getLogger(__name__)
@@ -20,7 +21,7 @@ class LibraryThing(object):
     """
     url = "https://www.librarything.com"
     csv_file_url = "https://www.librarything.com/export-csv"
     def __init__(self, username=None, password=None):
         self.username = username
         self.password = password
@@ -40,77 +41,98 @@ class LibraryThing(object):
     def parse_csv(self):
         h = HTMLParser.HTMLParser()
         reader = csv.DictReader(self.csv_handle)
-        # There are more fields to be parsed out. Note that there is a second author column to handle
-        for (i,row) in enumerate(reader):
+        # There are more fields to be parsed out. Note that there is a
+        # second author column to handle
+        for (i, row) in enumerate(reader):
             # ISBNs are written like '[123456789x]' in the CSV, suggesting possibility of a list
             m = re.match(r'^\[(.*)\]$', row["'ISBNs'"])
             if m:
                 isbn = m.group(1).split()
             else:
                 isbn = []
-            yield {'title':h.unescape(row["'TITLE'"]), 'author':h.unescape(row["'AUTHOR (first, last)'"]),
-                   'isbn':isbn, 'comment':row["'COMMENT'"],
-                   'tags':row["'TAGS'"], 'collections':row["'COLLECTIONS'"],
-                   'reviews':h.unescape(row["'REVIEWS'"])}
+            yield {
+                'title':h.unescape(row["'TITLE'"]),
+                'author':h.unescape(row["'AUTHOR (first, last)'"]),
+                'isbn':isbn,
+                'comment':row["'COMMENT'"],
+                'tags':row["'TAGS'"],
+                'collections':row["'COLLECTIONS'"],
+                'reviews':h.unescape(row["'REVIEWS'"])
+            }
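
The bracketed-ISBN convention parse_csv handles is easy to check in isolation. A minimal, self-contained sketch; the sample value is invented for illustration and real exports may differ:

    import re

    def parse_isbn_field(value):
        # A '[...]' wrapper signals a whitespace-separated ISBN list;
        # anything else yields an empty list.
        m = re.match(r'^\[(.*)\]$', value)
        return m.group(1).split() if m else []

    print(parse_isbn_field('[0143039431 9780143039433]'))  # ['0143039431', '9780143039433']
    print(parse_isbn_field('n/a'))                         # []
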
     def viewstyle_1(self, rows):
-        for (i,row) in enumerate(rows):
+        for (i, row) in enumerate(rows):
             book_data = {}
             cols = row.xpath('td')
             # cover
-            book_data["cover"] = {"cover_id":cols[0].attrib["id"],
-                                  "image": {"width":cols[0].xpath('.//img')[0].attrib['width'],
-                                  "src": cols[0].xpath('.//img')[0].attrib['src']}
+            book_data["cover"] = {
+                "cover_id":cols[0].attrib["id"],
+                "image": {
+                    "width":cols[0].xpath('.//img')[0].attrib['width'],
+                    "src": cols[0].xpath('.//img')[0].attrib['src']
+                }
+            }
             # title
-            book_data["title"] = {"href":cols[1].xpath('.//a')[0].attrib['href'],
-                                  "title":cols[1].xpath('.//a')[0].text}
+            book_data["title"] = {
+                "href":cols[1].xpath('.//a')[0].attrib['href'],
+                "title":cols[1].xpath('.//a')[0].text
+            }
             # extract work_id and book_id from href
             try:
-                (book_data["work_id"], book_data["book_id"]) = re.match("^/work/(.*)/book/(.*)$",book_data["title"]["href"]).groups()
+                (book_data["work_id"], book_data["book_id"]) = re.match(
+                    "^/work/(.*)/book/(.*)$",
+                    book_data["title"]["href"]
+                ).groups()
             except:
                 (book_data["work_id"], book_data["book_id"]) = (None, None)
             # author -- what if there is more than 1? or none?
             try:
-                book_data["author"] = {"display_name":cols[2].xpath('.//a')[0].text,
-                                       "href":cols[2].xpath('.//a')[0].attrib['href'],
-                                       "name":cols[2].xpath('div')[0].text}
+                book_data["author"] = {
+                    "display_name":cols[2].xpath('.//a')[0].text,
+                    "href":cols[2].xpath('.//a')[0].attrib['href'],
+                    "name":cols[2].xpath('div')[0].text
+                }
             except:
                 book_data["author"] = None
             # date
             book_data["date"] = cols[3].xpath('span')[0].text
             # tags: grab tags that are not empty strings
             tag_links = cols[4].xpath('.//a')
             book_data["tags"] = filter(lambda x: x is not None, [a.text for a in tag_links])
             # rating -- count # of stars
             book_data["rating"] = len(cols[5].xpath('.//img[@alt="*"]'))
             # entry date
-            book_data["entry_date"] = datetime.date(datetime.strptime(cols[6].xpath('span')[0].text, "%b %d, %Y"))
+            book_data["entry_date"] = datetime.date(
+                datetime.strptime(cols[6].xpath('span')[0].text, "%b %d, %Y")
+            )
             yield book_data
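
Both view styles pull work_id and book_id out of the title href with the same regex, and viewstyle_1 parses the entry date with strptime. A small standalone sketch of those two steps, with an invented href and date in the forms the code expects:

    import re
    from datetime import datetime

    def ids_from_href(href):
        # /work/<work_id>/book/<book_id> -> (work_id, book_id); otherwise (None, None)
        m = re.match(r'^/work/(.*)/book/(.*)$', href)
        return m.groups() if m else (None, None)

    print(ids_from_href('/work/1060/book/80915'))   # ('1060', '80915')
    print(ids_from_href('/author/dickenscharles'))  # (None, None)

    # "Feb 22, 2018" in the '%b %d, %Y' form -> datetime.date(2018, 2, 22)
    print(datetime.strptime('Feb 22, 2018', '%b %d, %Y').date())
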
     def viewstyle_5(self, rows):
         # implement this view to get at the ISBNs
-        for (i,row) in enumerate(rows):
+        for (i, row) in enumerate(rows):
             book_data = {}
             cols = row.xpath('td')
             # title
             book_data["title"] = {"href":cols[0].xpath('.//a')[0].attrib['href'],
                                   "title":cols[0].xpath('.//a')[0].text}
             # extract work_id and book_id from href
             try:
-                (book_data["work_id"], book_data["book_id"]) = re.match("^/work/(.*)/book/(.*)$",book_data["title"]["href"]).groups()
+                (book_data["work_id"], book_data["book_id"]) = re.match(
+                    "^/work/(.*)/book/(.*)$",
+                    book_data["title"]["href"]
+                ).groups()
             except:
                 (book_data["work_id"], book_data["book_id"]) = (None, None)
             # tags
             tag_links = cols[1].xpath('.//a')
             book_data["tags"] = filter(lambda x: x is not None, [a.text for a in tag_links])
@@ -121,13 +143,13 @@ class LibraryThing(object):
             except Exception, e:
                 logger.info("no lc call number for: %s %s", book_data["title"], e)
                 book_data["lc_call_number"] = None
             # subject
             subjects = cols[3].xpath('.//div[@class="subjectLine"]')
             book_data["subjects"] = [{'href':s.xpath('a')[0].attrib['href'],
                                       'text':s.xpath('a')[0].text} for s in subjects]
             # isbn
             try:
                 book_data["isbn"] = cols[4].xpath('.//span')[0].text
@@ -136,88 +158,94 @@ class LibraryThing(object):
                 book_data["isbn"] = None
             except Exception, e:
                 book_data["isbn"] = None
             yield book_data
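
The per-cell xpath logic in viewstyle_5 can be exercised against a canned row. A hedged sketch; the HTML below is invented to stand in for one catalog row, and real LibraryThing markup may differ:

    from lxml import html

    doc = html.fromstring(
        '<table><tr>'
        '<td><a href="/work/1060/book/80915">Example Title</a></td>'
        '<td><a>fiction</a></td>'
        '<td><span>PS3562</span></td>'
        '<td><div class="subjectLine"><a href="/subject/Fiction">Fiction</a></div></td>'
        '<td><span>0143039431</span></td>'
        '</tr></table>'
    )
    cols = doc.xpath('//tr')[0].xpath('td')
    print(cols[2].xpath('.//span')[0].text)   # PS3562 (the LC call number cell)
    print([(s.xpath('a')[0].attrib['href'], s.xpath('a')[0].text)
           for s in cols[3].xpath('.//div[@class="subjectLine"]')])
    print(cols[4].xpath('.//span')[0].text)   # 0143039431 (the ISBN cell)
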
     def parse_user_catalog(self, view_style=1):
         from lxml import html
         # we can vary viewstyle to get different info
-        IMPLEMENTED_STYLES = [1,5]
+        IMPLEMENTED_STYLES = [1, 5]
         COLLECTION = 2 # set to get All Collections
         if view_style not in IMPLEMENTED_STYLES:
             raise NotImplementedError()
-        style_parser = getattr(self,"viewstyle_%s" % view_style)
+        style_parser = getattr(self, "viewstyle_%s" % view_style)
         next_page = True
         offset = 0
         cookies = None
         # go to the front page of LibraryThing first to pick up relevant session-like cookies
         r = requests.get("https://www.librarything.com/")
         cookies = r.cookies
         while next_page:
-            url = "https://www.librarything.com/catalog_bottom.php?view=%s&viewstyle=%d&collection=%d&offset=%d" % (self.username,
-                view_style, COLLECTION, offset)
+            url = "https://www.librarything.com/catalog_bottom.php?view=%s&viewstyle=%d&collection=%d&offset=%d" % (
+                self.username, view_style, COLLECTION, offset
+            )
             logger.info("url: %s", url)
             if cookies is None:
                 r = requests.get(url)
             else:
                 r = requests.get(url, cookies=cookies)
             if r.status_code != httplib.OK:
                 raise LibraryThingException("Error accessing %s: status %s" % (url, r.status_code))
             etree = html.fromstring(r.content)
             cookies = r.cookies # retain the cookies
             # look for a page bar
             # try to grab the total number of books
             # 1 - 50 of 82
             try:
                 count_text = etree.xpath('//td[@class="pbGroup"]')[0].text
-                total = int(re.search(r'(\d+)$',count_text).group(1))
+                total = int(re.search(r'(\d+)$', count_text).group(1))
                 logger.info('total: %d', total)
-            except Exception, e: # assume for now that if we can't grab this text, there is no page bar and no books
+            except Exception, e:
+                # assume for now that if we can't grab this text,
+                # there is no page bar and no books
                 logger.info('Exception {0}'.format(e))
                 total = 0
-            # to do paging we can either look for a next link or just increase the offset by the number of rows.
+            # to do paging we can either look for a next link or just increase the offset
+            # by the number of rows.
             # Let's try the latter
             # possible_next_link = etree.xpath('//a[@class="pageShuttleButton"]')[0]
             rows_xpath = '//table[@id="lt_catalog_list"]/tbody/tr'
             # deal with page 1 first and then working on paging through the collection
             rows = etree.xpath(rows_xpath)
-            i = -1 # have to account for the problem of style_parser(rows) returning nothing
-            for (i,row) in enumerate(style_parser(rows)):
-                yield row
-            # page size = 50, first page offset = 0, second page offset = 50 -- if total = 50 no need to go
-            offset += i + 1
+            i = -1 # have to account for the problem of style_parser(rows) returning nothing
+            for (i, row) in enumerate(style_parser(rows)):
+                yield row
+            # page size = 50, first page offset = 0, second page offset = 50
+            # -- if total = 50 no need to go
+            offset += i + 1
             if offset >= total:
                 next_page = False
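
The paging bookkeeping above can be sanity-checked without any HTTP: total comes from the trailing number in the page bar text, and offset advances by however many rows were actually parsed. A small sketch, assuming the 50-row pages the comments describe:

    import re

    def page_offsets(total, page_size=50):
        # Mirror the loop: advance offset by the rows seen, stop once offset >= total.
        offset = 0
        while offset < total:
            rows = min(page_size, total - offset)
            yield (offset, rows)
            offset += rows

    count_text = '1 - 50 of 82'  # example page bar text
    total = int(re.search(r'(\d+)$', count_text).group(1))
    print(total)                      # 82
    print(list(page_offsets(total)))  # [(0, 50), (50, 32)]
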
 def load_librarything_into_wishlist(user, lt_username, max_books=None):
     """
-    Load a specified LibraryThing shelf (by default: all the books from the LibraryThing account associated with user)
+    Load a specified LibraryThing shelf (by default: all the books
+    from the LibraryThing account associated with user)
     """
     from regluit.core import bookloader
     from regluit.core import tasks
     from itertools import islice
     logger.info("Entering into load_librarything_into_wishlist")
     lt = LibraryThing(lt_username)
-    for (i,book) in enumerate(islice(lt.parse_user_catalog(view_style=5),max_books)):
+    for (i, book) in enumerate(islice(lt.parse_user_catalog(view_style=5), max_books)):
         isbn = book["isbn"] # grab the first one
         logger.info("%d %s %s", i, book["title"]["title"], isbn)
         try:
@@ -227,13 +255,27 @@ def load_librarything_into_wishlist(user, lt_username, max_books=None):
             if not edition:
                 continue
             # add the librarything ids to the db since we know them now
-            identifier= models.Identifier.get_or_add(type = 'thng', value = book['book_id'], edition = edition, work = edition.work)
-            identifier= models.Identifier.get_or_add(type = 'ltwk', value = book['work_id'], work = edition.work)
+            identifier = models.Identifier.get_or_add(
+                type='thng',
+                value=book['book_id'],
+                edition=edition,
+                work=edition.work
+            )
+            identifier = models.Identifier.get_or_add(
+                type='ltwk',
+                value=book['work_id'],
+                work=edition.work
+            )
             if book['lc_call_number']:
-                identifier= models.Identifier.get_or_add(type = 'lccn', value = book['lc_call_number'], edition = edition, work = edition.work)
+                identifier = models.Identifier.get_or_add(
+                    type='lccn',
+                    value=book['lc_call_number'],
+                    edition=edition,
+                    work=edition.work
+                )
             user.wishlist.add_work(edition.work, 'librarything', notify=True)
             if edition.new:
                 tasks.populate_edition.delay(edition.isbn_13)
             logger.info("Work with isbn %s added to wishlist.", isbn)
         except Exception, e:
-            logger.info ("error adding ISBN %s: %s", isbn, e)
+            logger.info("error adding ISBN %s: %s", isbn, e)