Merge remote-tracking branch 'Gluejar/master' into production

pull/91/head
eric 2018-02-22 12:04:30 -05:00
commit d7f8c26882
2 changed files with 119 additions and 79 deletions

View File

@ -2,11 +2,12 @@ import csv
import HTMLParser import HTMLParser
import httplib import httplib
import logging import logging
import mechanize
import re import re
from datetime import datetime
import mechanize
import requests import requests
from datetime import datetime
from regluit.core import models from regluit.core import models
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -40,43 +41,59 @@ class LibraryThing(object):
def parse_csv(self): def parse_csv(self):
h = HTMLParser.HTMLParser() h = HTMLParser.HTMLParser()
reader = csv.DictReader(self.csv_handle) reader = csv.DictReader(self.csv_handle)
# There are more fields to be parsed out. Note that there is a second author column to handle # There are more fields to be parsed out. Note that there is a
for (i,row) in enumerate(reader): # second author column to handle
for (i, row) in enumerate(reader):
# ISBNs are written like '[123456789x]' in the CSV, suggesting possibility of a list # ISBNs are written like '[123456789x]' in the CSV, suggesting possibility of a list
m = re.match(r'^\[(.*)\]$', row["'ISBNs'"]) m = re.match(r'^\[(.*)\]$', row["'ISBNs'"])
if m: if m:
isbn = m.group(1).split() isbn = m.group(1).split()
else: else:
isbn = [] isbn = []
yield {'title':h.unescape(row["'TITLE'"]), 'author':h.unescape(row["'AUTHOR (first, last)'"]), yield {
'isbn':isbn, 'comment':row["'COMMENT'"], 'title':h.unescape(row["'TITLE'"]),
'tags':row["'TAGS'"], 'collections':row["'COLLECTIONS'"], 'author':h.unescape(row["'AUTHOR (first, last)'"]),
'reviews':h.unescape(row["'REVIEWS'"])} 'isbn':isbn,
'comment':row["'COMMENT'"],
'tags':row["'TAGS'"],
'collections':row["'COLLECTIONS'"],
'reviews':h.unescape(row["'REVIEWS'"])
}
def viewstyle_1(self, rows): def viewstyle_1(self, rows):
for (i,row) in enumerate(rows): for (i, row) in enumerate(rows):
book_data = {} book_data = {}
cols = row.xpath('td') cols = row.xpath('td')
# cover # cover
book_data["cover"] = {"cover_id":cols[0].attrib["id"], book_data["cover"] = {
"image": {"width":cols[0].xpath('.//img')[0].attrib['width'], "cover_id":cols[0].attrib["id"],
"src": cols[0].xpath('.//img')[0].attrib['src']} "image": {
"width":cols[0].xpath('.//img')[0].attrib['width'],
"src": cols[0].xpath('.//img')[0].attrib['src']
}
} }
# title # title
book_data["title"] = {"href":cols[1].xpath('.//a')[0].attrib['href'], book_data["title"] = {
"title":cols[1].xpath('.//a')[0].text} "href":cols[1].xpath('.//a')[0].attrib['href'],
"title":cols[1].xpath('.//a')[0].text
}
# extract work_id and book_id from href # extract work_id and book_id from href
try: try:
(book_data["work_id"], book_data["book_id"]) = re.match("^/work/(.*)/book/(.*)$",book_data["title"]["href"]).groups() (book_data["work_id"], book_data["book_id"]) = re.match(
"^/work/(.*)/book/(.*)$",
book_data["title"]["href"]
).groups()
except: except:
(book_data["work_id"], book_data["book_id"]) = (None, None) (book_data["work_id"], book_data["book_id"]) = (None, None)
# author -- what if there is more than 1? or none? # author -- what if there is more than 1? or none?
try: try:
book_data["author"] = {"display_name":cols[2].xpath('.//a')[0].text, book_data["author"] = {
"href":cols[2].xpath('.//a')[0].attrib['href'], "display_name":cols[2].xpath('.//a')[0].text,
"name":cols[2].xpath('div')[0].text} "href":cols[2].xpath('.//a')[0].attrib['href'],
"name":cols[2].xpath('div')[0].text
}
except: except:
book_data["author"] = None book_data["author"] = None
@ -91,13 +108,15 @@ class LibraryThing(object):
book_data["rating"] = len(cols[5].xpath('.//img[@alt="*"]')) book_data["rating"] = len(cols[5].xpath('.//img[@alt="*"]'))
# entry date # entry date
book_data["entry_date"] = datetime.date(datetime.strptime(cols[6].xpath('span')[0].text, "%b %d, %Y")) book_data["entry_date"] = datetime.date(
datetime.strptime(cols[6].xpath('span')[0].text, "%b %d, %Y")
)
yield book_data yield book_data
def viewstyle_5(self, rows): def viewstyle_5(self, rows):
# implement this view to get at the ISBNs # implement this view to get at the ISBNs
for (i,row) in enumerate(rows): for (i, row) in enumerate(rows):
book_data = {} book_data = {}
cols = row.xpath('td') cols = row.xpath('td')
@ -107,7 +126,10 @@ class LibraryThing(object):
# extract work_id and book_id from href # extract work_id and book_id from href
try: try:
(book_data["work_id"], book_data["book_id"]) = re.match("^/work/(.*)/book/(.*)$",book_data["title"]["href"]).groups() (book_data["work_id"], book_data["book_id"]) = re.match(
"^/work/(.*)/book/(.*)$",
book_data["title"]["href"]
).groups()
except: except:
(book_data["work_id"], book_data["book_id"]) = (None, None) (book_data["work_id"], book_data["book_id"]) = (None, None)
@ -145,12 +167,12 @@ class LibraryThing(object):
# we can vary viewstyle to get different info # we can vary viewstyle to get different info
IMPLEMENTED_STYLES = [1,5] IMPLEMENTED_STYLES = [1, 5]
COLLECTION = 2 # set to get All Collections COLLECTION = 2 # set to get All Collections
if view_style not in IMPLEMENTED_STYLES: if view_style not in IMPLEMENTED_STYLES:
raise NotImplementedError() raise NotImplementedError()
style_parser = getattr(self,"viewstyle_%s" % view_style) style_parser = getattr(self, "viewstyle_%s" % view_style)
next_page = True next_page = True
offset = 0 offset = 0
cookies = None cookies = None
@ -160,8 +182,9 @@ class LibraryThing(object):
cookies = r.cookies cookies = r.cookies
while next_page: while next_page:
url = "https://www.librarything.com/catalog_bottom.php?view=%s&viewstyle=%d&collection=%d&offset=%d" % (self.username, url = "https://www.librarything.com/catalog_bottom.php?view=%s&viewstyle=%d&collection=%d&offset=%d" % (
view_style, COLLECTION, offset) self.username, view_style, COLLECTION, offset
)
logger.info("url: %s", url) logger.info("url: %s", url)
if cookies is None: if cookies is None:
r = requests.get(url) r = requests.get(url)
@ -169,10 +192,8 @@ class LibraryThing(object):
r = requests.get(url, cookies=cookies) r = requests.get(url, cookies=cookies)
if r.status_code != httplib.OK: if r.status_code != httplib.OK:
raise LibraryThingException("Error accessing %s: %s" % (url, e)) raise LibraryThingException("Error accessing %s: status %s" % (url, r.status_code))
logger.info("Error accessing %s: %s", url, e)
etree = html.fromstring(r.content) etree = html.fromstring(r.content)
#logger.info("r.content %s", r.content)
cookies = r.cookies # retain the cookies cookies = r.cookies # retain the cookies
# look for a page bar # look for a page bar
@ -180,13 +201,16 @@ class LibraryThing(object):
# 1 - 50 of 82 # 1 - 50 of 82
try: try:
count_text = etree.xpath('//td[@class="pbGroup"]')[0].text count_text = etree.xpath('//td[@class="pbGroup"]')[0].text
total = int(re.search(r'(\d+)$',count_text).group(1)) total = int(re.search(r'(\d+)$', count_text).group(1))
logger.info('total: %d', total) logger.info('total: %d', total)
except Exception, e: # assume for now that if we can't grab this text, there is no page bar and no books except Exception, e:
# assume for now that if we can't grab this text,
# there is no page bar and no books
logger.info('Exception {0}'.format(e)) logger.info('Exception {0}'.format(e))
total = 0 total = 0
# to do paging we can either look for a next link or just increase the offset by the number of rows. # to do paging we can either look for a next link or just increase the offset
# by the number of rows.
# Let's try the latter # Let's try the latter
# possible_next_link = etree.xpath('//a[@class="pageShuttleButton"]')[0] # possible_next_link = etree.xpath('//a[@class="pageShuttleButton"]')[0]
@ -197,10 +221,11 @@ class LibraryThing(object):
i = -1 # have to account for the problem of style_parser(rows) returning nothing i = -1 # have to account for the problem of style_parser(rows) returning nothing
for (i,row) in enumerate(style_parser(rows)): for (i, row) in enumerate(style_parser(rows)):
yield row yield row
# page size = 50, first page offset = 0, second page offset = 50 -- if total = 50 no need to go # page size = 50, first page offset = 0, second page offset = 50
# -- if total = 50 no need to go
offset += i + 1 offset += i + 1
if offset >= total: if offset >= total:
@ -208,7 +233,8 @@ class LibraryThing(object):
def load_librarything_into_wishlist(user, lt_username, max_books=None): def load_librarything_into_wishlist(user, lt_username, max_books=None):
""" """
Load a specified LibraryThing shelf (by default: all the books from the LibraryThing account associated with user) Load a specified LibraryThing shelf (by default: all the books
from the LibraryThing account associated with user)
""" """
from regluit.core import bookloader from regluit.core import bookloader
@ -219,7 +245,7 @@ def load_librarything_into_wishlist(user, lt_username, max_books=None):
lt = LibraryThing(lt_username) lt = LibraryThing(lt_username)
for (i,book) in enumerate(islice(lt.parse_user_catalog(view_style=5),max_books)): for (i, book) in enumerate(islice(lt.parse_user_catalog(view_style=5), max_books)):
isbn = book["isbn"] # grab the first one isbn = book["isbn"] # grab the first one
logger.info("%d %s %s", i, book["title"]["title"], isbn) logger.info("%d %s %s", i, book["title"]["title"], isbn)
try: try:
@ -229,13 +255,27 @@ def load_librarything_into_wishlist(user, lt_username, max_books=None):
if not edition: if not edition:
continue continue
# add the librarything ids to the db since we know them now # add the librarything ids to the db since we know them now
identifier= models.Identifier.get_or_add(type = 'thng', value = book['book_id'], edition = edition, work = edition.work) identifier = models.Identifier.get_or_add(
identifier= models.Identifier.get_or_add(type = 'ltwk', value = book['work_id'], work = edition.work) type='thng',
value=book['book_id'],
edition=edition,
work=edition.work
)
identifier = models.Identifier.get_or_add(
type='ltwk',
value=book['work_id'],
work=edition.work
)
if book['lc_call_number']: if book['lc_call_number']:
identifier= models.Identifier.get_or_add(type = 'lccn', value = book['lc_call_number'], edition = edition, work = edition.work) identifier = models.Identifier.get_or_add(
type='lccn',
value=book['lc_call_number'],
edition=edition,
work=edition.work
)
user.wishlist.add_work(edition.work, 'librarything', notify=True) user.wishlist.add_work(edition.work, 'librarything', notify=True)
if edition.new: if edition.new:
tasks.populate_edition.delay(edition.isbn_13) tasks.populate_edition.delay(edition.isbn_13)
logger.info("Work with isbn %s added to wishlist.", isbn) logger.info("Work with isbn %s added to wishlist.", isbn)
except Exception, e: except Exception, e:
logger.info ("error adding ISBN %s: %s", isbn, e) logger.info("error adding ISBN %s: %s", isbn, e)

View File

@ -30,9 +30,9 @@ class Command(BaseCommand):
books = [] books = []
for sitemap in content: for sitemap in content:
added = add_by_sitemap(sitemap.strip(), maxnum=max) added = add_by_sitemap(sitemap.strip(), maxnum=max)
max = max - len(added) max = max - len(added) if max else max
books = books + added books = books + added
if max < 0: if max and max < 0:
break break
else: else:
books = add_by_sitemap(url, maxnum=max) books = add_by_sitemap(url, maxnum=max)