delint
parent
1425b29d49
commit
957cf615b4
|
@ -2,11 +2,12 @@ import csv
|
|||
import HTMLParser
|
||||
import httplib
|
||||
import logging
|
||||
import mechanize
|
||||
import re
|
||||
from datetime import datetime
|
||||
|
||||
import mechanize
|
||||
import requests
|
||||
|
||||
from datetime import datetime
|
||||
from regluit.core import models
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
@ -40,7 +41,8 @@ class LibraryThing(object):
|
|||
def parse_csv(self):
|
||||
h = HTMLParser.HTMLParser()
|
||||
reader = csv.DictReader(self.csv_handle)
|
||||
# There are more fields to be parsed out. Note that there is a second author column to handle
|
||||
# There are more fields to be parsed out. Note that there is a
|
||||
# second author column to handle
|
||||
for (i, row) in enumerate(reader):
|
||||
# ISBNs are written like '[123456789x]' in the CSV, suggesting possibility of a list
|
||||
m = re.match(r'^\[(.*)\]$', row["'ISBNs'"])
|
||||
|
@ -48,35 +50,50 @@ class LibraryThing(object):
|
|||
isbn = m.group(1).split()
|
||||
else:
|
||||
isbn = []
|
||||
yield {'title':h.unescape(row["'TITLE'"]), 'author':h.unescape(row["'AUTHOR (first, last)'"]),
|
||||
'isbn':isbn, 'comment':row["'COMMENT'"],
|
||||
'tags':row["'TAGS'"], 'collections':row["'COLLECTIONS'"],
|
||||
'reviews':h.unescape(row["'REVIEWS'"])}
|
||||
yield {
|
||||
'title':h.unescape(row["'TITLE'"]),
|
||||
'author':h.unescape(row["'AUTHOR (first, last)'"]),
|
||||
'isbn':isbn,
|
||||
'comment':row["'COMMENT'"],
|
||||
'tags':row["'TAGS'"],
|
||||
'collections':row["'COLLECTIONS'"],
|
||||
'reviews':h.unescape(row["'REVIEWS'"])
|
||||
}
|
||||
def viewstyle_1(self, rows):
|
||||
|
||||
for (i, row) in enumerate(rows):
|
||||
book_data = {}
|
||||
cols = row.xpath('td')
|
||||
# cover
|
||||
book_data["cover"] = {"cover_id":cols[0].attrib["id"],
|
||||
"image": {"width":cols[0].xpath('.//img')[0].attrib['width'],
|
||||
"src": cols[0].xpath('.//img')[0].attrib['src']}
|
||||
book_data["cover"] = {
|
||||
"cover_id":cols[0].attrib["id"],
|
||||
"image": {
|
||||
"width":cols[0].xpath('.//img')[0].attrib['width'],
|
||||
"src": cols[0].xpath('.//img')[0].attrib['src']
|
||||
}
|
||||
}
|
||||
# title
|
||||
book_data["title"] = {"href":cols[1].xpath('.//a')[0].attrib['href'],
|
||||
"title":cols[1].xpath('.//a')[0].text}
|
||||
book_data["title"] = {
|
||||
"href":cols[1].xpath('.//a')[0].attrib['href'],
|
||||
"title":cols[1].xpath('.//a')[0].text
|
||||
}
|
||||
|
||||
# extract work_id and book_id from href
|
||||
try:
|
||||
(book_data["work_id"], book_data["book_id"]) = re.match("^/work/(.*)/book/(.*)$",book_data["title"]["href"]).groups()
|
||||
(book_data["work_id"], book_data["book_id"]) = re.match(
|
||||
"^/work/(.*)/book/(.*)$",
|
||||
book_data["title"]["href"]
|
||||
).groups()
|
||||
except:
|
||||
(book_data["work_id"], book_data["book_id"]) = (None, None)
|
||||
|
||||
# author -- what if there is more than 1? or none?
|
||||
try:
|
||||
book_data["author"] = {"display_name":cols[2].xpath('.//a')[0].text,
|
||||
book_data["author"] = {
|
||||
"display_name":cols[2].xpath('.//a')[0].text,
|
||||
"href":cols[2].xpath('.//a')[0].attrib['href'],
|
||||
"name":cols[2].xpath('div')[0].text}
|
||||
"name":cols[2].xpath('div')[0].text
|
||||
}
|
||||
except:
|
||||
book_data["author"] = None
|
||||
|
||||
|
@ -91,7 +108,9 @@ class LibraryThing(object):
|
|||
book_data["rating"] = len(cols[5].xpath('.//img[@alt="*"]'))
|
||||
|
||||
# entry date
|
||||
book_data["entry_date"] = datetime.date(datetime.strptime(cols[6].xpath('span')[0].text, "%b %d, %Y"))
|
||||
book_data["entry_date"] = datetime.date(
|
||||
datetime.strptime(cols[6].xpath('span')[0].text, "%b %d, %Y")
|
||||
)
|
||||
|
||||
yield book_data
|
||||
|
||||
|
@ -107,7 +126,10 @@ class LibraryThing(object):
|
|||
|
||||
# extract work_id and book_id from href
|
||||
try:
|
||||
(book_data["work_id"], book_data["book_id"]) = re.match("^/work/(.*)/book/(.*)$",book_data["title"]["href"]).groups()
|
||||
(book_data["work_id"], book_data["book_id"]) = re.match(
|
||||
"^/work/(.*)/book/(.*)$",
|
||||
book_data["title"]["href"]
|
||||
).groups()
|
||||
except:
|
||||
(book_data["work_id"], book_data["book_id"]) = (None, None)
|
||||
|
||||
|
@ -160,8 +182,9 @@ class LibraryThing(object):
|
|||
cookies = r.cookies
|
||||
|
||||
while next_page:
|
||||
url = "https://www.librarything.com/catalog_bottom.php?view=%s&viewstyle=%d&collection=%d&offset=%d" % (self.username,
|
||||
view_style, COLLECTION, offset)
|
||||
url = "https://www.librarything.com/catalog_bottom.php?view=%s&viewstyle=%d&collection=%d&offset=%d" % (
|
||||
self.username, view_style, COLLECTION, offset
|
||||
)
|
||||
logger.info("url: %s", url)
|
||||
if cookies is None:
|
||||
r = requests.get(url)
|
||||
|
@ -180,11 +203,14 @@ class LibraryThing(object):
|
|||
count_text = etree.xpath('//td[@class="pbGroup"]')[0].text
|
||||
total = int(re.search(r'(\d+)$', count_text).group(1))
|
||||
logger.info('total: %d', total)
|
||||
except Exception, e: # assume for now that if we can't grab this text, there is no page bar and no books
|
||||
except Exception, e:
|
||||
# assume for now that if we can't grab this text,
|
||||
# there is no page bar and no books
|
||||
logger.info('Exception {0}'.format(e))
|
||||
total = 0
|
||||
|
||||
# to do paging we can either look for a next link or just increase the offset by the number of rows.
|
||||
# to do paging we can either look for a next link or just increase the offset
|
||||
# by the number of rows.
|
||||
# Let's try the latter
|
||||
# possible_next_link = etree.xpath('//a[@class="pageShuttleButton"]')[0]
|
||||
|
||||
|
@ -198,7 +224,8 @@ class LibraryThing(object):
|
|||
for (i, row) in enumerate(style_parser(rows)):
|
||||
yield row
|
||||
|
||||
# page size = 50, first page offset = 0, second page offset = 50 -- if total = 50 no need to go
|
||||
# page size = 50, first page offset = 0, second page offset = 50
|
||||
# -- if total = 50 no need to go
|
||||
|
||||
offset += i + 1
|
||||
if offset >= total:
|
||||
|
@ -206,7 +233,8 @@ class LibraryThing(object):
|
|||
|
||||
def load_librarything_into_wishlist(user, lt_username, max_books=None):
|
||||
"""
|
||||
Load a specified LibraryThing shelf (by default: all the books from the LibraryThing account associated with user)
|
||||
Load a specified LibraryThing shelf (by default: all the books
|
||||
from the LibraryThing account associated with user)
|
||||
"""
|
||||
|
||||
from regluit.core import bookloader
|
||||
|
@ -227,10 +255,24 @@ def load_librarything_into_wishlist(user, lt_username, max_books=None):
|
|||
if not edition:
|
||||
continue
|
||||
# add the librarything ids to the db since we know them now
|
||||
identifier= models.Identifier.get_or_add(type = 'thng', value = book['book_id'], edition = edition, work = edition.work)
|
||||
identifier= models.Identifier.get_or_add(type = 'ltwk', value = book['work_id'], work = edition.work)
|
||||
identifier = models.Identifier.get_or_add(
|
||||
type='thng',
|
||||
value=book['book_id'],
|
||||
edition=edition,
|
||||
work=edition.work
|
||||
)
|
||||
identifier = models.Identifier.get_or_add(
|
||||
type='ltwk',
|
||||
value=book['work_id'],
|
||||
work=edition.work
|
||||
)
|
||||
if book['lc_call_number']:
|
||||
identifier= models.Identifier.get_or_add(type = 'lccn', value = book['lc_call_number'], edition = edition, work = edition.work)
|
||||
identifier = models.Identifier.get_or_add(
|
||||
type='lccn',
|
||||
value=book['lc_call_number'],
|
||||
edition=edition,
|
||||
work=edition.work
|
||||
)
|
||||
user.wishlist.add_work(edition.work, 'librarything', notify=True)
|
||||
if edition.new:
|
||||
tasks.populate_edition.delay(edition.isbn_13)
|
||||
|
|
Loading…
Reference in New Issue