pull/91/head
eric 2018-02-22 11:29:52 -05:00
parent 1425b29d49
commit 957cf615b4
1 changed file with 116 additions and 74 deletions


@@ -2,11 +2,12 @@ import csv
 import HTMLParser
 import httplib
 import logging
-import mechanize
 import re
+from datetime import datetime
+import mechanize
 import requests
-from datetime import datetime
 from regluit.core import models
 logger = logging.getLogger(__name__)
@@ -20,7 +21,7 @@ class LibraryThing(object):
     """
     url = "https://www.librarything.com"
     csv_file_url = "https://www.librarything.com/export-csv"
     def __init__(self, username=None, password=None):
         self.username = username
         self.password = password
@@ -40,77 +41,98 @@ class LibraryThing(object):
     def parse_csv(self):
         h = HTMLParser.HTMLParser()
         reader = csv.DictReader(self.csv_handle)
-        # There are more fields to be parsed out. Note that there is a second author column to handle
-        for (i,row) in enumerate(reader):
+        # There are more fields to be parsed out. Note that there is a
+        # second author column to handle
+        for (i, row) in enumerate(reader):
             # ISBNs are written like '[123456789x]' in the CSV, suggesting possibility of a list
             m = re.match(r'^\[(.*)\]$', row["'ISBNs'"])
             if m:
                 isbn = m.group(1).split()
             else:
                 isbn = []
-            yield {'title':h.unescape(row["'TITLE'"]), 'author':h.unescape(row["'AUTHOR (first, last)'"]),
-                   'isbn':isbn, 'comment':row["'COMMENT'"],
-                   'tags':row["'TAGS'"], 'collections':row["'COLLECTIONS'"],
-                   'reviews':h.unescape(row["'REVIEWS'"])}
+            yield {
+                'title':h.unescape(row["'TITLE'"]),
+                'author':h.unescape(row["'AUTHOR (first, last)'"]),
+                'isbn':isbn,
+                'comment':row["'COMMENT'"],
+                'tags':row["'TAGS'"],
+                'collections':row["'COLLECTIONS'"],
+                'reviews':h.unescape(row["'REVIEWS'"])
+            }
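
The bracketed-ISBN convention parse_csv handles is easy to check in isolation. A minimal, self-contained sketch; the sample value is invented for illustration and real exports may differ:

    import re

    def parse_isbn_field(value):
        # A '[...]' wrapper signals a whitespace-separated ISBN list;
        # anything else yields an empty list.
        m = re.match(r'^\[(.*)\]$', value)
        return m.group(1).split() if m else []

    print(parse_isbn_field('[0143039431 9780143039433]'))  # ['0143039431', '9780143039433']
    print(parse_isbn_field('n/a'))                         # []
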
     def viewstyle_1(self, rows):
-        for (i,row) in enumerate(rows):
+        for (i, row) in enumerate(rows):
             book_data = {}
             cols = row.xpath('td')
             # cover
-            book_data["cover"] = {"cover_id":cols[0].attrib["id"],
-                                  "image": {"width":cols[0].xpath('.//img')[0].attrib['width'],
-                                  "src": cols[0].xpath('.//img')[0].attrib['src']}
+            book_data["cover"] = {
+                "cover_id":cols[0].attrib["id"],
+                "image": {
+                    "width":cols[0].xpath('.//img')[0].attrib['width'],
+                    "src": cols[0].xpath('.//img')[0].attrib['src']
+                }
+            }
             # title
-            book_data["title"] = {"href":cols[1].xpath('.//a')[0].attrib['href'],
-                                  "title":cols[1].xpath('.//a')[0].text}
+            book_data["title"] = {
+                "href":cols[1].xpath('.//a')[0].attrib['href'],
+                "title":cols[1].xpath('.//a')[0].text
+            }
             # extract work_id and book_id from href
             try:
-                (book_data["work_id"], book_data["book_id"]) = re.match("^/work/(.*)/book/(.*)$",book_data["title"]["href"]).groups()
+                (book_data["work_id"], book_data["book_id"]) = re.match(
+                    "^/work/(.*)/book/(.*)$",
+                    book_data["title"]["href"]
+                ).groups()
             except:
                 (book_data["work_id"], book_data["book_id"]) = (None, None)
             # author -- what if there is more than 1? or none?
             try:
-                book_data["author"] = {"display_name":cols[2].xpath('.//a')[0].text,
-                                       "href":cols[2].xpath('.//a')[0].attrib['href'],
-                                       "name":cols[2].xpath('div')[0].text}
+                book_data["author"] = {
+                    "display_name":cols[2].xpath('.//a')[0].text,
+                    "href":cols[2].xpath('.//a')[0].attrib['href'],
+                    "name":cols[2].xpath('div')[0].text
+                }
             except:
                 book_data["author"] = None
             # date
             book_data["date"] = cols[3].xpath('span')[0].text
             # tags: grab tags that are not empty strings
             tag_links = cols[4].xpath('.//a')
             book_data["tags"] = filter(lambda x: x is not None, [a.text for a in tag_links])
             # rating -- count # of stars
             book_data["rating"] = len(cols[5].xpath('.//img[@alt="*"]'))
             # entry date
-            book_data["entry_date"] = datetime.date(datetime.strptime(cols[6].xpath('span')[0].text, "%b %d, %Y"))
+            book_data["entry_date"] = datetime.date(
+                datetime.strptime(cols[6].xpath('span')[0].text, "%b %d, %Y")
+            )
             yield book_data
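
Both view styles pull work_id and book_id out of the title href with the same regex, and viewstyle_1 parses the entry date with strptime. A small standalone sketch of those two steps, with an invented href and date in the forms the code expects:

    import re
    from datetime import datetime

    def ids_from_href(href):
        # /work/<work_id>/book/<book_id> -> (work_id, book_id); otherwise (None, None)
        m = re.match(r'^/work/(.*)/book/(.*)$', href)
        return m.groups() if m else (None, None)

    print(ids_from_href('/work/1060/book/80915'))   # ('1060', '80915')
    print(ids_from_href('/author/dickenscharles'))  # (None, None)

    # "Feb 22, 2018" in the '%b %d, %Y' form -> datetime.date(2018, 2, 22)
    print(datetime.strptime('Feb 22, 2018', '%b %d, %Y').date())
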
     def viewstyle_5(self, rows):
         # implement this view to get at the ISBNs
-        for (i,row) in enumerate(rows):
+        for (i, row) in enumerate(rows):
             book_data = {}
             cols = row.xpath('td')
             # title
             book_data["title"] = {"href":cols[0].xpath('.//a')[0].attrib['href'],
                                   "title":cols[0].xpath('.//a')[0].text}
             # extract work_id and book_id from href
             try:
-                (book_data["work_id"], book_data["book_id"]) = re.match("^/work/(.*)/book/(.*)$",book_data["title"]["href"]).groups()
+                (book_data["work_id"], book_data["book_id"]) = re.match(
+                    "^/work/(.*)/book/(.*)$",
+                    book_data["title"]["href"]
+                ).groups()
             except:
                 (book_data["work_id"], book_data["book_id"]) = (None, None)
             # tags
             tag_links = cols[1].xpath('.//a')
             book_data["tags"] = filter(lambda x: x is not None, [a.text for a in tag_links])
@@ -121,13 +143,13 @@ class LibraryThing(object):
             except Exception, e:
                 logger.info("no lc call number for: %s %s", book_data["title"], e)
                 book_data["lc_call_number"] = None
             # subject
             subjects = cols[3].xpath('.//div[@class="subjectLine"]')
             book_data["subjects"] = [{'href':s.xpath('a')[0].attrib['href'],
                                       'text':s.xpath('a')[0].text} for s in subjects]
             # isbn
             try:
                 book_data["isbn"] = cols[4].xpath('.//span')[0].text
@@ -136,88 +158,94 @@ class LibraryThing(object):
                 book_data["isbn"] = None
             except Exception, e:
                 book_data["isbn"] = None
             yield book_data
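
The per-cell xpath logic in viewstyle_5 can be exercised against a canned row. A hedged sketch; the HTML below is invented to stand in for one catalog row, and real LibraryThing markup may differ:

    from lxml import html

    doc = html.fromstring(
        '<table><tr>'
        '<td><a href="/work/1060/book/80915">Example Title</a></td>'
        '<td><a>fiction</a></td>'
        '<td><span>PS3562</span></td>'
        '<td><div class="subjectLine"><a href="/subject/Fiction">Fiction</a></div></td>'
        '<td><span>0143039431</span></td>'
        '</tr></table>'
    )
    cols = doc.xpath('//tr')[0].xpath('td')
    print(cols[2].xpath('.//span')[0].text)   # PS3562 (the LC call number cell)
    print([(s.xpath('a')[0].attrib['href'], s.xpath('a')[0].text)
           for s in cols[3].xpath('.//div[@class="subjectLine"]')])
    print(cols[4].xpath('.//span')[0].text)   # 0143039431 (the ISBN cell)
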
     def parse_user_catalog(self, view_style=1):
         from lxml import html
         # we can vary viewstyle to get different info
-        IMPLEMENTED_STYLES = [1,5]
+        IMPLEMENTED_STYLES = [1, 5]
         COLLECTION = 2 # set to get All Collections
         if view_style not in IMPLEMENTED_STYLES:
             raise NotImplementedError()
-        style_parser = getattr(self,"viewstyle_%s" % view_style)
+        style_parser = getattr(self, "viewstyle_%s" % view_style)
         next_page = True
         offset = 0
         cookies = None
         # go to the front page of LibraryThing first to pick up relevant session-like cookies
         r = requests.get("https://www.librarything.com/")
         cookies = r.cookies
         while next_page:
-            url = "https://www.librarything.com/catalog_bottom.php?view=%s&viewstyle=%d&collection=%d&offset=%d" % (self.username,
-                view_style, COLLECTION, offset)
+            url = "https://www.librarything.com/catalog_bottom.php?view=%s&viewstyle=%d&collection=%d&offset=%d" % (
+                self.username, view_style, COLLECTION, offset
+            )
             logger.info("url: %s", url)
             if cookies is None:
                 r = requests.get(url)
             else:
                 r = requests.get(url, cookies=cookies)
             if r.status_code != httplib.OK:
                 raise LibraryThingException("Error accessing %s: status %s" % (url, r.status_code))
             etree = html.fromstring(r.content)
             cookies = r.cookies # retain the cookies
             # look for a page bar
             # try to grab the total number of books
             # 1 - 50 of 82
             try:
                 count_text = etree.xpath('//td[@class="pbGroup"]')[0].text
-                total = int(re.search(r'(\d+)$',count_text).group(1))
+                total = int(re.search(r'(\d+)$', count_text).group(1))
                 logger.info('total: %d', total)
-            except Exception, e: # assume for now that if we can't grab this text, there is no page bar and no books
+            except Exception, e:
+                # assume for now that if we can't grab this text,
+                # there is no page bar and no books
                 logger.info('Exception {0}'.format(e))
                 total = 0
-            # to do paging we can either look for a next link or just increase the offset by the number of rows.
+            # to do paging we can either look for a next link or just increase the offset
+            # by the number of rows.
             # Let's try the latter
             # possible_next_link = etree.xpath('//a[@class="pageShuttleButton"]')[0]
             rows_xpath = '//table[@id="lt_catalog_list"]/tbody/tr'
             # deal with page 1 first and then working on paging through the collection
             rows = etree.xpath(rows_xpath)
-            i = -1 # have to account for the problem of style_parser(rows) returning nothing
-            for (i,row) in enumerate(style_parser(rows)):
-                yield row
-            # page size = 50, first page offset = 0, second page offset = 50 -- if total = 50 no need to go
-            offset += i + 1
+            i = -1 # have to account for the problem of style_parser(rows) returning nothing
+            for (i, row) in enumerate(style_parser(rows)):
+                yield row
+            # page size = 50, first page offset = 0, second page offset = 50
+            # -- if total = 50 no need to go
+            offset += i + 1
             if offset >= total:
                 next_page = False
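
The paging bookkeeping above can be sanity-checked without any HTTP: total comes from the trailing number in the page bar text, and offset advances by however many rows were actually parsed. A small sketch, assuming the 50-row pages the comments describe:

    import re

    def page_offsets(total, page_size=50):
        # Mirror the loop: advance offset by the rows seen, stop once offset >= total.
        offset = 0
        while offset < total:
            rows = min(page_size, total - offset)
            yield (offset, rows)
            offset += rows

    count_text = '1 - 50 of 82'  # example page bar text
    total = int(re.search(r'(\d+)$', count_text).group(1))
    print(total)                      # 82
    print(list(page_offsets(total)))  # [(0, 50), (50, 32)]
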
 def load_librarything_into_wishlist(user, lt_username, max_books=None):
     """
-    Load a specified LibraryThing shelf (by default: all the books from the LibraryThing account associated with user)
+    Load a specified LibraryThing shelf (by default: all the books
+    from the LibraryThing account associated with user)
     """
     from regluit.core import bookloader
     from regluit.core import tasks
     from itertools import islice
     logger.info("Entering into load_librarything_into_wishlist")
     lt = LibraryThing(lt_username)
-    for (i,book) in enumerate(islice(lt.parse_user_catalog(view_style=5),max_books)):
+    for (i, book) in enumerate(islice(lt.parse_user_catalog(view_style=5), max_books)):
         isbn = book["isbn"] # grab the first one
         logger.info("%d %s %s", i, book["title"]["title"], isbn)
         try:
@@ -227,13 +255,27 @@ def load_librarything_into_wishlist(user, lt_username, max_books=None):
             if not edition:
                 continue
             # add the librarything ids to the db since we know them now
-            identifier= models.Identifier.get_or_add(type = 'thng', value = book['book_id'], edition = edition, work = edition.work)
-            identifier= models.Identifier.get_or_add(type = 'ltwk', value = book['work_id'], work = edition.work)
+            identifier = models.Identifier.get_or_add(
+                type='thng',
+                value=book['book_id'],
+                edition=edition,
+                work=edition.work
+            )
+            identifier = models.Identifier.get_or_add(
+                type='ltwk',
+                value=book['work_id'],
+                work=edition.work
+            )
             if book['lc_call_number']:
-                identifier= models.Identifier.get_or_add(type = 'lccn', value = book['lc_call_number'], edition = edition, work = edition.work)
+                identifier = models.Identifier.get_or_add(
+                    type='lccn',
+                    value=book['lc_call_number'],
+                    edition=edition,
+                    work=edition.work
+                )
             user.wishlist.add_work(edition.work, 'librarything', notify=True)
             if edition.new:
                 tasks.populate_edition.delay(edition.isbn_13)
             logger.info("Work with isbn %s added to wishlist.", isbn)
         except Exception, e:
-            logger.info ("error adding ISBN %s: %s", isbn, e)
+            logger.info("error adding ISBN %s: %s", isbn, e)