Merge remote-tracking branch 'Gluejar/master' into production
commit
d7f8c26882
|
@ -2,11 +2,12 @@ import csv
|
|||
import HTMLParser
|
||||
import httplib
|
||||
import logging
|
||||
import mechanize
|
||||
import re
|
||||
from datetime import datetime
|
||||
|
||||
import mechanize
|
||||
import requests
|
||||
|
||||
from datetime import datetime
|
||||
from regluit.core import models
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
@ -40,43 +41,59 @@ class LibraryThing(object):
|
|||
def parse_csv(self):
|
||||
h = HTMLParser.HTMLParser()
|
||||
reader = csv.DictReader(self.csv_handle)
|
||||
# There are more fields to be parsed out. Note that there is a second author column to handle
|
||||
for (i,row) in enumerate(reader):
|
||||
# There are more fields to be parsed out. Note that there is a
|
||||
# second author column to handle
|
||||
for (i, row) in enumerate(reader):
|
||||
# ISBNs are written like '[123456789x]' in the CSV, suggesting possibility of a list
|
||||
m = re.match(r'^\[(.*)\]$', row["'ISBNs'"])
|
||||
if m:
|
||||
isbn = m.group(1).split()
|
||||
else:
|
||||
isbn = []
|
||||
yield {'title':h.unescape(row["'TITLE'"]), 'author':h.unescape(row["'AUTHOR (first, last)'"]),
|
||||
'isbn':isbn, 'comment':row["'COMMENT'"],
|
||||
'tags':row["'TAGS'"], 'collections':row["'COLLECTIONS'"],
|
||||
'reviews':h.unescape(row["'REVIEWS'"])}
|
||||
yield {
|
||||
'title':h.unescape(row["'TITLE'"]),
|
||||
'author':h.unescape(row["'AUTHOR (first, last)'"]),
|
||||
'isbn':isbn,
|
||||
'comment':row["'COMMENT'"],
|
||||
'tags':row["'TAGS'"],
|
||||
'collections':row["'COLLECTIONS'"],
|
||||
'reviews':h.unescape(row["'REVIEWS'"])
|
||||
}
|
||||
def viewstyle_1(self, rows):
|
||||
|
||||
for (i,row) in enumerate(rows):
|
||||
for (i, row) in enumerate(rows):
|
||||
book_data = {}
|
||||
cols = row.xpath('td')
|
||||
# cover
|
||||
book_data["cover"] = {"cover_id":cols[0].attrib["id"],
|
||||
"image": {"width":cols[0].xpath('.//img')[0].attrib['width'],
|
||||
"src": cols[0].xpath('.//img')[0].attrib['src']}
|
||||
book_data["cover"] = {
|
||||
"cover_id":cols[0].attrib["id"],
|
||||
"image": {
|
||||
"width":cols[0].xpath('.//img')[0].attrib['width'],
|
||||
"src": cols[0].xpath('.//img')[0].attrib['src']
|
||||
}
|
||||
}
|
||||
# title
|
||||
book_data["title"] = {"href":cols[1].xpath('.//a')[0].attrib['href'],
|
||||
"title":cols[1].xpath('.//a')[0].text}
|
||||
book_data["title"] = {
|
||||
"href":cols[1].xpath('.//a')[0].attrib['href'],
|
||||
"title":cols[1].xpath('.//a')[0].text
|
||||
}
|
||||
|
||||
# extract work_id and book_id from href
|
||||
try:
|
||||
(book_data["work_id"], book_data["book_id"]) = re.match("^/work/(.*)/book/(.*)$",book_data["title"]["href"]).groups()
|
||||
(book_data["work_id"], book_data["book_id"]) = re.match(
|
||||
"^/work/(.*)/book/(.*)$",
|
||||
book_data["title"]["href"]
|
||||
).groups()
|
||||
except:
|
||||
(book_data["work_id"], book_data["book_id"]) = (None, None)
|
||||
|
||||
# author -- what if there is more than 1? or none?
|
||||
try:
|
||||
book_data["author"] = {"display_name":cols[2].xpath('.//a')[0].text,
|
||||
"href":cols[2].xpath('.//a')[0].attrib['href'],
|
||||
"name":cols[2].xpath('div')[0].text}
|
||||
book_data["author"] = {
|
||||
"display_name":cols[2].xpath('.//a')[0].text,
|
||||
"href":cols[2].xpath('.//a')[0].attrib['href'],
|
||||
"name":cols[2].xpath('div')[0].text
|
||||
}
|
||||
except:
|
||||
book_data["author"] = None
|
||||
|
||||
|
@ -91,13 +108,15 @@ class LibraryThing(object):
|
|||
book_data["rating"] = len(cols[5].xpath('.//img[@alt="*"]'))
|
||||
|
||||
# entry date
|
||||
book_data["entry_date"] = datetime.date(datetime.strptime(cols[6].xpath('span')[0].text, "%b %d, %Y"))
|
||||
book_data["entry_date"] = datetime.date(
|
||||
datetime.strptime(cols[6].xpath('span')[0].text, "%b %d, %Y")
|
||||
)
|
||||
|
||||
yield book_data
|
||||
|
||||
def viewstyle_5(self, rows):
|
||||
# implement this view to get at the ISBNs
|
||||
for (i,row) in enumerate(rows):
|
||||
for (i, row) in enumerate(rows):
|
||||
book_data = {}
|
||||
cols = row.xpath('td')
|
||||
|
||||
|
@ -107,7 +126,10 @@ class LibraryThing(object):
|
|||
|
||||
# extract work_id and book_id from href
|
||||
try:
|
||||
(book_data["work_id"], book_data["book_id"]) = re.match("^/work/(.*)/book/(.*)$",book_data["title"]["href"]).groups()
|
||||
(book_data["work_id"], book_data["book_id"]) = re.match(
|
||||
"^/work/(.*)/book/(.*)$",
|
||||
book_data["title"]["href"]
|
||||
).groups()
|
||||
except:
|
||||
(book_data["work_id"], book_data["book_id"]) = (None, None)
|
||||
|
||||
|
@ -145,12 +167,12 @@ class LibraryThing(object):
|
|||
|
||||
# we can vary viewstyle to get different info
|
||||
|
||||
IMPLEMENTED_STYLES = [1,5]
|
||||
IMPLEMENTED_STYLES = [1, 5]
|
||||
COLLECTION = 2 # set to get All Collections
|
||||
|
||||
if view_style not in IMPLEMENTED_STYLES:
|
||||
raise NotImplementedError()
|
||||
style_parser = getattr(self,"viewstyle_%s" % view_style)
|
||||
style_parser = getattr(self, "viewstyle_%s" % view_style)
|
||||
next_page = True
|
||||
offset = 0
|
||||
cookies = None
|
||||
|
@ -160,8 +182,9 @@ class LibraryThing(object):
|
|||
cookies = r.cookies
|
||||
|
||||
while next_page:
|
||||
url = "https://www.librarything.com/catalog_bottom.php?view=%s&viewstyle=%d&collection=%d&offset=%d" % (self.username,
|
||||
view_style, COLLECTION, offset)
|
||||
url = "https://www.librarything.com/catalog_bottom.php?view=%s&viewstyle=%d&collection=%d&offset=%d" % (
|
||||
self.username, view_style, COLLECTION, offset
|
||||
)
|
||||
logger.info("url: %s", url)
|
||||
if cookies is None:
|
||||
r = requests.get(url)
|
||||
|
@ -169,10 +192,8 @@ class LibraryThing(object):
|
|||
r = requests.get(url, cookies=cookies)
|
||||
|
||||
if r.status_code != httplib.OK:
|
||||
raise LibraryThingException("Error accessing %s: %s" % (url, e))
|
||||
logger.info("Error accessing %s: %s", url, e)
|
||||
raise LibraryThingException("Error accessing %s: status %s" % (url, r.status_code))
|
||||
etree = html.fromstring(r.content)
|
||||
#logger.info("r.content %s", r.content)
|
||||
cookies = r.cookies # retain the cookies
|
||||
|
||||
# look for a page bar
|
||||
|
@ -180,13 +201,16 @@ class LibraryThing(object):
|
|||
# 1 - 50 of 82
|
||||
try:
|
||||
count_text = etree.xpath('//td[@class="pbGroup"]')[0].text
|
||||
total = int(re.search(r'(\d+)$',count_text).group(1))
|
||||
total = int(re.search(r'(\d+)$', count_text).group(1))
|
||||
logger.info('total: %d', total)
|
||||
except Exception, e: # assume for now that if we can't grab this text, there is no page bar and no books
|
||||
except Exception, e:
|
||||
# assume for now that if we can't grab this text,
|
||||
# there is no page bar and no books
|
||||
logger.info('Exception {0}'.format(e))
|
||||
total = 0
|
||||
|
||||
# to do paging we can either look for a next link or just increase the offset by the number of rows.
|
||||
# to do paging we can either look for a next link or just increase the offset
|
||||
# by the number of rows.
|
||||
# Let's try the latter
|
||||
# possible_next_link = etree.xpath('//a[@class="pageShuttleButton"]')[0]
|
||||
|
||||
|
@ -197,10 +221,11 @@ class LibraryThing(object):
|
|||
|
||||
i = -1 # have to account for the problem of style_parser(rows) returning nothing
|
||||
|
||||
for (i,row) in enumerate(style_parser(rows)):
|
||||
for (i, row) in enumerate(style_parser(rows)):
|
||||
yield row
|
||||
|
||||
# page size = 50, first page offset = 0, second page offset = 50 -- if total = 50 no need to go
|
||||
# page size = 50, first page offset = 0, second page offset = 50
|
||||
# -- if total = 50 no need to go
|
||||
|
||||
offset += i + 1
|
||||
if offset >= total:
|
||||
|
@ -208,7 +233,8 @@ class LibraryThing(object):
|
|||
|
||||
def load_librarything_into_wishlist(user, lt_username, max_books=None):
|
||||
"""
|
||||
Load a specified LibraryThing shelf (by default: all the books from the LibraryThing account associated with user)
|
||||
Load a specified LibraryThing shelf (by default: all the books
|
||||
from the LibraryThing account associated with user)
|
||||
"""
|
||||
|
||||
from regluit.core import bookloader
|
||||
|
@ -219,7 +245,7 @@ def load_librarything_into_wishlist(user, lt_username, max_books=None):
|
|||
lt = LibraryThing(lt_username)
|
||||
|
||||
|
||||
for (i,book) in enumerate(islice(lt.parse_user_catalog(view_style=5),max_books)):
|
||||
for (i, book) in enumerate(islice(lt.parse_user_catalog(view_style=5), max_books)):
|
||||
isbn = book["isbn"] # grab the first one
|
||||
logger.info("%d %s %s", i, book["title"]["title"], isbn)
|
||||
try:
|
||||
|
@ -229,13 +255,27 @@ def load_librarything_into_wishlist(user, lt_username, max_books=None):
|
|||
if not edition:
|
||||
continue
|
||||
# add the librarything ids to the db since we know them now
|
||||
identifier= models.Identifier.get_or_add(type = 'thng', value = book['book_id'], edition = edition, work = edition.work)
|
||||
identifier= models.Identifier.get_or_add(type = 'ltwk', value = book['work_id'], work = edition.work)
|
||||
identifier = models.Identifier.get_or_add(
|
||||
type='thng',
|
||||
value=book['book_id'],
|
||||
edition=edition,
|
||||
work=edition.work
|
||||
)
|
||||
identifier = models.Identifier.get_or_add(
|
||||
type='ltwk',
|
||||
value=book['work_id'],
|
||||
work=edition.work
|
||||
)
|
||||
if book['lc_call_number']:
|
||||
identifier= models.Identifier.get_or_add(type = 'lccn', value = book['lc_call_number'], edition = edition, work = edition.work)
|
||||
identifier = models.Identifier.get_or_add(
|
||||
type='lccn',
|
||||
value=book['lc_call_number'],
|
||||
edition=edition,
|
||||
work=edition.work
|
||||
)
|
||||
user.wishlist.add_work(edition.work, 'librarything', notify=True)
|
||||
if edition.new:
|
||||
tasks.populate_edition.delay(edition.isbn_13)
|
||||
logger.info("Work with isbn %s added to wishlist.", isbn)
|
||||
except Exception, e:
|
||||
logger.info ("error adding ISBN %s: %s", isbn, e)
|
||||
logger.info("error adding ISBN %s: %s", isbn, e)
|
||||
|
|
|
@ -30,9 +30,9 @@ class Command(BaseCommand):
|
|||
books = []
|
||||
for sitemap in content:
|
||||
added = add_by_sitemap(sitemap.strip(), maxnum=max)
|
||||
max = max - len(added)
|
||||
max = max - len(added) if max else max
|
||||
books = books + added
|
||||
if max < 0:
|
||||
if max and max < 0:
|
||||
break
|
||||
else:
|
||||
books = add_by_sitemap(url, maxnum=max)
|
||||
|
|
Loading…
Reference in New Issue