Merge pull request #85 from EbookFoundation/catchup

Catchup
pull/87/head
eshellman 2018-06-06 22:44:38 -04:00 committed by GitHub
commit 985b65ea3f
78 changed files with 1551 additions and 55413 deletions

View File

@@ -25,7 +25,7 @@ def onix_feed(facet, max=None):
editions = facet.facet_object.filter_model("Edition",editions).distinct()
for edition in editions:
edition_prod = product(edition, facet.facet_object)
if edition_prod:
if edition_prod is not None:
feed.append(edition_prod)
return etree.tostring(feed, pretty_print=True)
@@ -34,7 +34,7 @@ def onix_feed_for_work(work):
feed.append(header(work))
for edition in models.Edition.objects.filter(work=work,ebooks__isnull=False).distinct():
edition_prod = product(edition)
if edition_prod:
if edition_prod is not None:
feed.append(product(edition))
return etree.tostring(feed, pretty_print=True)
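
Worth noting why "is not None" replaces the truthiness test here: product() returns an lxml element, and lxml elements with no child elements evaluate as False, so "if edition_prod:" would silently drop a valid but still-empty Product element. A minimal sketch (not from the repo), assuming lxml, which the pretty_print argument above implies:

from lxml import etree

prod = etree.Element('Product')  # a real element with no children yet
assert prod is not None          # the element exists...
assert not bool(prod)            # ...but is falsy until it gains child elements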

View File

@@ -10,6 +10,7 @@ django imports
from django.contrib.auth.models import User
from django.test import TestCase
from django.test.client import Client
from django.utils.timezone import now
"""
regluit imports
@@ -17,7 +18,6 @@ regluit imports
import regluit.core.isbn
from regluit.core import models
from regluit.utils.localdatetime import now
from regluit.api import models as apimodels
class ApiTests(TestCase):

View File

@@ -40,7 +40,7 @@ urlpatterns = [
url(r"^onix/(?P<facet>.*)/$", OnixView.as_view(), name="onix"),
url(r"^onix/$", OnixView.as_view(), name="onix_all"),
url(r'^id/work/(?P<work_id>\w+)/$', negotiate_content, name="work_identifier"),
url(r'^loader/yaml$',load_yaml, name="load_yaml"),
url(r'^travisci/webhook$',travisci_webhook, name="travisci_webhook"),
url(r'^loader/yaml$', load_yaml, name="load_yaml"),
url(r'^travisci/webhook$', travisci_webhook, name="travisci_webhook"),
url(r'^', include(v1_api.urls)),
]

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

View File

@@ -5,25 +5,24 @@ from urllib import quote
from functools import partial
from xml.etree import ElementTree
from django.apps import apps
from . exceptions import BooXtreamError
from . models import Boox
class BooXtream(object):
""" ``apikey``
The API key for your BooXtream account, obtained from BooXtream. Defaults to using
The API key for your BooXtream account, obtained from BooXtream. Defaults to using
settings.BOOXTREAM_API_KEY
``apiuser``
The username key for your BooXtream account, obtained from BooXtream. Defaults to using
The username key for your BooXtream account, obtained from BooXtream. Defaults to using
settings.BOOXTREAM_API_USER
``timeout``
passed to requests
"""
def __init__(self,
@@ -36,58 +35,60 @@ class BooXtream(object):
apiuser = settings.BOOXTREAM_API_USER
self.endpoint = 'https://service.booxtream.com/'
self.postrequest = partial(requests.post, timeout=timeout, auth=(apiuser,apikey))
def platform(self, epubfile=None, epub=True, kf8mobi=False, **kwargs):
""" Make an API request to BooXtream
""" Make an API request to BooXtream
``self.apikey``, ``epubfile`` and the supplied ``kwargs``.
Attempts to deserialize the XML response and return the download link.
Will raise ``BooXtreamError`` if BooXtream returns an exception
code.
"""
url = self.endpoint + 'booxtream.xml'
Boox = apps.get_model('booxtream', 'Boox')
url = self.endpoint + 'booxtream.xml'
kwargs['epub'] = '1' if epub else '0'
kwargs['kf8mobi'] = '1' if kf8mobi else '0'
if epubfile:
if hasattr(epubfile,'name') and str(epubfile.name).endswith('.epub'):
files= {'epubfile': (str(epubfile.name),epubfile)}
else:
# give it a random file name so that kindlegen doesn't choke
# needed for in-memory (StringIO) epubs
# give it a random file name so that kindlegen doesn't choke
# needed for in-memory (StringIO) epubs
files= {'epubfile': ('%012x.epub' % random.randrange(16**12),epubfile)}
else:
files={}
files={}
if settings.LOCAL_TEST:
# fake it, so you can test other functions without hitting booxtream
boox = Boox.objects.create(
download_link_epub='https://github.com/eshellman/42_ebook/blob/master/download/42.epub?raw=true&extra=download.booxtream.com/',
download_link_mobi='https://github.com/eshellman/42_ebook/blob/master/download/42.mobi?raw=true',
referenceid= kwargs.get('referenceid'),
downloads_remaining= kwargs.get('downloadlimit'),
expirydays=kwargs.get('expirydays'),
)
download_link_epub='https://github.com/eshellman/42_ebook/blob/master/download/42.epub?raw=true&extra=download.booxtream.com/',
download_link_mobi='https://github.com/eshellman/42_ebook/blob/master/download/42.mobi?raw=true',
referenceid= kwargs.get('referenceid'),
downloads_remaining= kwargs.get('downloadlimit'),
expirydays=kwargs.get('expirydays'),
)
return boox
resp = self.postrequest(url, data=kwargs, files=files)
doc = ElementTree.fromstring(resp.content)
# it turns out an Error can have an Error in it
errors = doc.findall('.//Response/Error')
errors = doc.findall('.//Response/Error')
if len(errors) > 0:
raise BooXtreamError(errors)
download_link_epub = doc.find('.//DownloadLink[@type="epub"]')
if download_link_epub is not None:
download_link_epub = download_link_epub.text
download_link_epub = download_link_epub.text
download_link_mobi = doc.find('.//DownloadLink[@type="mobi"]')
if download_link_mobi is not None:
download_link_mobi = download_link_mobi.text
download_link_mobi = download_link_mobi.text
boox = Boox.objects.create(
download_link_epub=download_link_epub,
download_link_mobi=download_link_mobi,
referenceid= kwargs.get('referenceid'),
downloads_remaining= kwargs.get('downloadlimit'),
expirydays=kwargs.get('expirydays'),
)
download_link_epub=download_link_epub,
download_link_mobi=download_link_mobi,
referenceid= kwargs.get('referenceid'),
downloads_remaining= kwargs.get('downloadlimit'),
expirydays=kwargs.get('expirydays'),
)
return boox
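
For reference, a hypothetical invocation of the wrapper above. The keyword names (referenceid, downloadlimit, expirydays) are the ones the method forwards to BooXtream; the constructor call is an assumption, since __init__ is truncated in this diff apart from its apiuser/apikey/timeout handling:

inst = BooXtream(timeout=30)  # apiuser and apikey default to the Django settings
with open('book.epub', 'rb') as epubfile:
    boox = inst.platform(
        epubfile=epubfile,
        epub=True,
        kf8mobi=False,
        referenceid='order-0001',  # illustrative values
        downloadlimit=3,
        expirydays=30,
    )
print boox.download_link_epub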

View File

@@ -1,11 +1,10 @@
from django.apps import AppConfig
from django.db.models.signals import post_migrate
from regluit.core.signals import create_notice_types
class CoreConfig(AppConfig):
name = 'regluit.core'
verbose_name = ' core objects'
def ready(self):
from regluit.core.signals import create_notice_types
post_migrate.connect(create_notice_types, sender=self)

View File

@@ -23,6 +23,7 @@ from django_comments.models import Comment
from github3 import (login, GitHub)
from github3.repos.release import Release
from django.utils.timezone import now
from gitenberg.metadata.pandata import Pandata
# regluit imports
@@ -31,7 +32,6 @@ import regluit
import regluit.core.isbn
from regluit.core.validation import test_file
from regluit.marc.models import inverse_marc_rels
from regluit.utils.localdatetime import now
from . import cc
from . import models
@@ -49,7 +49,7 @@ def add_by_oclc(isbn, work=None):
def add_by_oclc_from_google(oclc):
if oclc:
logger.info("adding book by oclc %s", oclc)
logger.info(u"adding book by oclc %s", oclc)
else:
return None
try:
@@ -59,10 +59,10 @@ def add_by_oclc_from_google(oclc):
try:
results = _get_json(url, {"q": '"OCLC%s"' % oclc})
except LookupFailure, e:
logger.exception("lookup failure for %s", oclc)
logger.exception(u"lookup failure for %s", oclc)
return None
if not results.has_key('items') or not results['items']:
logger.warn("no google hits for %s", oclc)
logger.warn(u"no google hits for %s", oclc)
return None
try:
@@ -70,16 +70,16 @@ def add_by_oclc_from_google(oclc):
models.Identifier(type='oclc', value=oclc, edition=e, work=e.work).save()
return e
except LookupFailure, e:
logger.exception("failed to add edition for %s", oclc)
logger.exception(u"failed to add edition for %s", oclc)
except IntegrityError, e:
logger.exception("google books data for %s didn't fit our db", oclc)
logger.exception(u"google books data for %s didn't fit our db", oclc)
return None
def valid_isbn(isbn):
try:
return identifier_cleaner('isbn')(isbn)
except:
logger.exception("invalid isbn: %s", isbn)
logger.exception(u"invalid isbn: %s", isbn)
return None
def add_by_isbn(isbn, work=None, language='xx', title=''):
@@ -88,13 +88,17 @@ def add_by_isbn(isbn, work=None, language='xx', title=''):
try:
e = add_by_isbn_from_google(isbn, work=work)
except LookupFailure:
logger.exception("failed google lookup for %s", isbn)
logger.exception(u"failed google lookup for %s", isbn)
# try again some other time
return None
if e:
if e.work.language == 'xx' and language != 'xx':
e.work.language == language
e.work.save()
logger.info('changed language for {} to {}'.format(isbn, language))
return e
logger.info("null came back from add_by_isbn_from_google: %s", isbn)
logger.info(u"null came back from add_by_isbn_from_google: %s", isbn)
# if there's a title, we want to create stub editions and
# works, even if google doesn't know about it # but if it's not valid,
@@ -129,10 +133,10 @@ def get_google_isbn_results(isbn):
try:
results = _get_json(url, {"q": "isbn:%s" % isbn})
except LookupFailure:
logger.exception("lookup failure for %s", isbn)
logger.exception(u"lookup failure for %s", isbn)
return None
if not results.has_key('items') or not results['items']:
logger.warn("no google hits for %s", isbn)
logger.warn(u"no google hits for %s", isbn)
return None
return results
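
The sweep of u"" prefixes on the log format strings in this file is Python 2 hygiene: a byte-string template raises if it ever contains non-ASCII bytes and is interpolated with a unicode argument, while a unicode template stays safe. A small illustration (not from the repo):

title = u'Caf\xe9'
msg = u'adding book %s' % title    # unicode template: always safe
try:
    'caf\xc3\xa9 book %s' % title  # byte template holding non-ASCII bytes
except UnicodeDecodeError:
    pass                           # the failure mode the u'' prefixes avoid
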
@@ -201,7 +205,7 @@ def update_edition(edition):
# if the language of the edition no longer matches that of the parent work,
# attach edition to the
if edition.work.language != language:
logger.info("reconnecting %s since it is %s instead of %s",
logger.info(u"reconnecting %s since it is %s instead of %s",
googlebooks_id, language, edition.work.language)
old_work = edition.work
@@ -210,7 +214,7 @@ def update_edition(edition):
edition.work = new_work
edition.save()
for identifier in edition.identifiers.all():
logger.info("moving identifier %s", identifier.value)
logger.info(u"moving identifier %s", identifier.value)
identifier.work = new_work
identifier.save()
if old_work and old_work.editions.count() == 0:
@@ -256,7 +260,7 @@ def add_by_isbn_from_google(isbn, work=None):
edition.new = False
return edition
logger.info("adding new book by isbn %s", isbn)
logger.info(u"adding new book by isbn %s", isbn)
results = get_google_isbn_results(isbn)
if results:
try:
@@ -267,9 +271,9 @@ def add_by_isbn_from_google(isbn, work=None):
isbn=isbn
)
except LookupFailure, e:
logger.exception("failed to add edition for %s", isbn)
logger.exception(u"failed to add edition for %s", isbn)
except IntegrityError, e:
logger.exception("google books data for %s didn't fit our db", isbn)
logger.exception(u"google books data for %s didn't fit our db", isbn)
return None
return None
@@ -320,7 +324,7 @@ def add_by_googlebooks_id(googlebooks_id, work=None, results=None, isbn=None):
if results:
item = results
else:
logger.info("loading metadata from google for %s", googlebooks_id)
logger.info(u"loading metadata from google for %s", googlebooks_id)
url = "https://www.googleapis.com/books/v1/volumes/%s" % googlebooks_id
item = _get_json(url)
d = item['volumeInfo']
@@ -343,7 +347,7 @@ def add_by_googlebooks_id(googlebooks_id, work=None, results=None, isbn=None):
if len(language) > 5:
language = language[0:5]
if work and work.language != language:
logger.info("not connecting %s since it is %s instead of %s",
logger.info(u"not connecting %s since it is %s instead of %s",
googlebooks_id, language, work.language)
work = None
# isbn = None
@@ -371,7 +375,7 @@ def add_by_googlebooks_id(googlebooks_id, work=None, results=None, isbn=None):
try:
e = models.Identifier.objects.get(type='goog', value=googlebooks_id).edition
e.new = False
logger.warning(" whoa nellie, somebody else created an edition while we were working.")
logger.warning(u" whoa nellie, somebody else created an edition while we were working.")
if work.new:
work.delete()
return e
@@ -404,19 +408,19 @@ def relate_isbn(isbn, cluster_size=1):
"""add a book by isbn and then see if there's an existing work to add it to so as to make a
cluster bigger than cluster_size.
"""
logger.info("finding a related work for %s", isbn)
logger.info(u"finding a related work for %s", isbn)
edition = add_by_isbn(isbn)
if edition is None:
return None
if edition.work is None:
logger.info("didn't add related to null work")
logger.info(u"didn't add related to null work")
return None
if edition.work.editions.count() > cluster_size:
return edition.work
for other_isbn in thingisbn(isbn):
# 979's come back as 13
logger.debug("other_isbn: %s", other_isbn)
logger.debug(u"other_isbn: %s", other_isbn)
if len(other_isbn) == 10:
other_isbn = regluit.core.isbn.convert_10_to_13(other_isbn)
related_edition = add_by_isbn(other_isbn, work=edition.work)
@@ -427,7 +431,7 @@ def relate_isbn(isbn, cluster_size=1):
related_edition.work = edition.work
related_edition.save()
elif related_edition.work_id != edition.work_id:
logger.debug("merge_works path 1 %s %s", edition.work_id, related_edition.work_id)
logger.debug(u"merge_works path 1 %s %s", edition.work_id, related_edition.work_id)
merge_works(related_edition.work, edition.work)
if related_edition.work.editions.count() > cluster_size:
return related_edition.work
@@ -438,7 +442,7 @@ def add_related(isbn):
The initial seed ISBN will be added if it's not already there.
"""
# make sure the seed edition is there
logger.info("adding related editions for %s", isbn)
logger.info(u"adding related editions for %s", isbn)
new_editions = []
@@ -446,14 +450,14 @@
if edition is None:
return new_editions
if edition.work is None:
logger.warning("didn't add related to null work")
logger.warning(u"didn't add related to null work")
return new_editions
# this is the work everything will hang off
work = edition.work
other_editions = {}
for other_isbn in thingisbn(isbn):
# 979's come back as 13
logger.debug("other_isbn: %s", other_isbn)
logger.debug(u"other_isbn: %s", other_isbn)
if len(other_isbn) == 10:
other_isbn = regluit.core.isbn.convert_10_to_13(other_isbn)
related_edition = add_by_isbn(other_isbn, work=work)
@@ -466,7 +470,7 @@ def add_related(isbn):
related_edition.work = work
related_edition.save()
elif related_edition.work_id != work.id:
logger.debug("merge_works path 1 %s %s", work.id, related_edition.work_id)
logger.debug(u"merge_works path 1 %s %s", work.id, related_edition.work_id)
work = merge_works(work, related_edition.work)
else:
if other_editions.has_key(related_language):
@@ -476,14 +480,14 @@
# group the other language editions together
for lang_group in other_editions.itervalues():
logger.debug("lang_group (ed, work): %s", [(ed.id, ed.work_id) for ed in lang_group])
logger.debug(u"lang_group (ed, work): %s", [(ed.id, ed.work_id) for ed in lang_group])
if len(lang_group) > 1:
lang_edition = lang_group[0]
logger.debug("lang_edition.id: %s", lang_edition.id)
logger.debug(u"lang_edition.id: %s", lang_edition.id)
# compute the distinct set of works to merge into lang_edition.work
works_to_merge = set([ed.work for ed in lang_group[1:]]) - set([lang_edition.work])
for w in works_to_merge:
logger.debug("merge_works path 2 %s %s", lang_edition.work_id, w.id)
logger.debug(u"merge_works path 2 %s %s", lang_edition.work_id, w.id)
merged_work = merge_works(lang_edition.work, w)
models.WorkRelation.objects.get_or_create(
to_work=lang_group[0].work,
@@ -498,17 +502,21 @@ def thingisbn(isbn):
Library Thing. (takes isbn_10 or isbn_13, returns isbn_10, except for 979 isbns,
which come back as isbn_13')
"""
logger.info("looking up %s at ThingISBN", isbn)
logger.info(u"looking up %s at ThingISBN", isbn)
url = "https://www.librarything.com/api/thingISBN/%s" % isbn
xml = requests.get(url, headers={"User-Agent": settings.USER_AGENT}).content
doc = ElementTree.fromstring(xml)
return [e.text for e in doc.findall('isbn')]
try:
doc = ElementTree.fromstring(xml)
return [e.text for e in doc.findall('isbn')]
except SyntaxError:
# LibraryThing down
return []
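
The new except SyntaxError around the XML parse works because xml.etree.ElementTree.ParseError is a SyntaxError subclass, so an HTML error page from LibraryThing now yields an empty list instead of an unhandled exception. A quick check of that assumption:

from xml.etree import ElementTree

assert issubclass(ElementTree.ParseError, SyntaxError)
try:
    ElementTree.fromstring('<html>503 Service Unavailable')  # truncated non-XML body
except SyntaxError:
    pass  # the same path thingisbn() now takes before returning []
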
def merge_works(w1, w2, user=None):
"""will merge the second work (w2) into the first (w1)
"""
logger.info("merging work %s into %s", w2.id, w1.id)
logger.info(u"merging work %s into %s", w2.id, w1.id)
# don't merge if the works are the same or at least one of the works has no id
#(for example, when w2 has already been deleted)
if w1 is None or w2 is None or w1.id == w2.id or w1.id is None or w2.id is None:
@@ -583,7 +591,7 @@ def detach_edition(e):
will detach edition from its work, creating a new stub work. if remerge=true, will see if
there's another work to attach to
"""
logger.info("splitting edition %s from %s", e, e.work)
logger.info(u"splitting edition %s from %s", e, e.work)
w = models.Work(title=e.title, language=e.work.language)
w.save()
@@ -618,7 +626,7 @@ def add_openlibrary(work, hard_refresh=False):
work.save()
# find the first ISBN match in OpenLibrary
logger.info("looking up openlibrary data for work %s", work.id)
logger.info(u"looking up openlibrary data for work %s", work.id)
e = None # openlibrary edition json
w = None # openlibrary work json
@@ -633,7 +641,7 @@ def add_openlibrary(work, hard_refresh=False):
try:
e = _get_json(url, params, type='ol')
except LookupFailure:
logger.exception("OL lookup failed for %s", isbn_key)
logger.exception(u"OL lookup failed for %s", isbn_key)
e = {}
if e.has_key(isbn_key):
if e[isbn_key].has_key('details'):
@@ -673,7 +681,7 @@ def add_openlibrary(work, hard_refresh=False):
)
if e[isbn_key]['details'].has_key('works'):
work_key = e[isbn_key]['details']['works'].pop(0)['key']
logger.info("got openlibrary work %s for isbn %s", work_key, isbn_key)
logger.info(u"got openlibrary work %s for isbn %s", work_key, isbn_key)
models.Identifier.get_or_add(type='olwk', value=work_key, work=work)
try:
w = _get_json("https://openlibrary.org" + work_key, type='ol')
@@ -691,14 +699,14 @@ def add_openlibrary(work, hard_refresh=False):
if w.has_key('subjects') and len(w['subjects']) > len(subjects):
subjects = w['subjects']
except LookupFailure:
logger.exception("OL lookup failed for %s", work_key)
logger.exception(u"OL lookup failed for %s", work_key)
if not subjects:
logger.warn("unable to find work %s at openlibrary", work.id)
logger.warn(u"unable to find work %s at openlibrary", work.id)
return
# add the subjects to the Work
for s in subjects:
logger.info("adding subject %s to work %s", s, work.id)
logger.info(u"adding subject %s to work %s", s, work.id)
subject = models.Subject.set_by_name(s, work=work)
work.save()
@@ -716,9 +724,9 @@ def _get_json(url, params={}, type='gb'):
if response.status_code == 200:
return json.loads(response.content)
else:
logger.error("unexpected HTTP response: %s", response)
logger.error(u"unexpected HTTP response: %s", response)
if response.content:
logger.error("response content: %s", response.content)
logger.error(u"response content: %s", response.content)
raise LookupFailure("GET failed: url=%s and params=%s" % (url, params))
@@ -766,7 +774,7 @@ def load_gutenberg_edition(title, gutenberg_etext_id, ol_work_id, seed_isbn, url
ebook = models.Ebook()
if len(ebooks) > 1:
logger.warning("There is more than one Ebook matching url {0}".format(url))
logger.warning(u"There is more than one Ebook matching url {0}".format(url))
ebook.format = format
@@ -826,8 +834,6 @@ def edition_for_etype(etype, metadata, default=None):
for key in metadata.edition_identifiers.keys():
return edition_for_ident(key, metadata.identifiers[key])
MATCH_LICENSE = re.compile(r'creativecommons.org/licenses/([^/]+)/')
def load_ebookfile(url, etype):
'''
return a ContentFile if a new ebook has been loaded
@@ -960,8 +966,7 @@ class BasePandataLoader(object):
if contentfile:
contentfile_name = '/loaded/ebook_{}.{}'.format(edition.id, key)
path = default_storage.save(contentfile_name, contentfile)
lic = MATCH_LICENSE.search(metadata.rights_url)
license = 'CC {}'.format(lic.group(1).upper()) if lic else ''
license = cc.license_from_cc_url(metadata.rights_url)
ebf = models.EbookFile.objects.create(
format=key,
edition=edition,

View File

@@ -1,8 +1,11 @@
# coding=utf-8
# mostly constants related to Creative Commons
''' mostly constants related to Creative Commons
# let's be DRY with these parameters
## need to add versioned CC entries
'''
import re
INFO_CC = (
('CC BY-NC-ND', 'by-nc-nd', 'Creative Commons Attribution-NonCommercial-NoDerivs 3.0 Unported (CC BY-NC-ND 3.0)', 'https://creativecommons.org/licenses/by-nc-nd/3.0/', 'Creative Commons Attribution-NonCommercial-NoDerivs'),
@@ -162,3 +165,15 @@ def match_license(license_string):
except ValueError:
pass
return RIGHTS_ALIAS.get(license_string, None)
MATCH_LICENSE = re.compile(r'creativecommons.org/licenses/([^/]+)/')
def license_from_cc_url(rights_url):
if not rights_url:
return None
lic = MATCH_LICENSE.search(rights_url)
if lic:
return 'CC {}'.format(lic.group(1).upper())
if rights_url.find('openedition.org') >= 0:
return 'OPENEDITION'
return ''
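
A behavioral sketch of the new helper; these asserts are illustrative (the URLs are made up), not taken from the repo's tests:

assert license_from_cc_url('https://creativecommons.org/licenses/by-nc-nd/3.0/') == 'CC BY-NC-ND'
assert license_from_cc_url('http://books.openedition.org/example/terms') == 'OPENEDITION'
assert license_from_cc_url('https://example.com/all-rights-reserved') == ''
assert license_from_cc_url(None) is None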

View File

@@ -45,10 +45,10 @@ def convert_10_to_13(isbn):
except:
return None
ISBN_REGEX = re.compile(r'^(\d{9}|\d{12})(\d|X)$')
DASH_REGEX = re.compile(r'[ \-–—]+')
ISBN_REGEX = re.compile(r'^(\d{9}[\dX]|\d{13})$')
DASH_REGEX = re.compile(u'[ \\-–—‐,;]+') #includes unicode hyphen, endash and emdash
def strip(s):
"""Strips away any - or spaces. If the remaining string is of length 10 or 13
"""Strips away any - or spaces and some punctuation. If the remaining string is of length 10 or 13
with digits only in anything but the last
check digit (which may be X), then return '' -- otherwise return the remaining string
"""

View File

@@ -2,11 +2,12 @@ import csv
import HTMLParser
import httplib
import logging
import mechanize
import re
from datetime import datetime
import mechanize
import requests
from datetime import datetime
from regluit.core import models
logger = logging.getLogger(__name__)
@@ -20,7 +21,7 @@ class LibraryThing(object):
"""
url = "https://www.librarything.com"
csv_file_url = "https://www.librarything.com/export-csv"
def __init__(self, username=None, password=None):
self.username = username
self.password = password
@@ -40,77 +41,98 @@ class LibraryThing(object):
def parse_csv(self):
h = HTMLParser.HTMLParser()
reader = csv.DictReader(self.csv_handle)
# There are more fields to be parsed out. Note that there is a second author column to handle
for (i,row) in enumerate(reader):
# There are more fields to be parsed out. Note that there is a
# second author column to handle
for (i, row) in enumerate(reader):
# ISBNs are written like '[123456789x]' in the CSV, suggesting possibility of a list
m = re.match(r'^\[(.*)\]$', row["'ISBNs'"])
if m:
isbn = m.group(1).split()
else:
isbn = []
yield {'title':h.unescape(row["'TITLE'"]), 'author':h.unescape(row["'AUTHOR (first, last)'"]),
'isbn':isbn, 'comment':row["'COMMENT'"],
'tags':row["'TAGS'"], 'collections':row["'COLLECTIONS'"],
'reviews':h.unescape(row["'REVIEWS'"])}
yield {
'title':h.unescape(row["'TITLE'"]),
'author':h.unescape(row["'AUTHOR (first, last)'"]),
'isbn':isbn,
'comment':row["'COMMENT'"],
'tags':row["'TAGS'"],
'collections':row["'COLLECTIONS'"],
'reviews':h.unescape(row["'REVIEWS'"])
}
def viewstyle_1(self, rows):
for (i,row) in enumerate(rows):
for (i, row) in enumerate(rows):
book_data = {}
cols = row.xpath('td')
# cover
book_data["cover"] = {"cover_id":cols[0].attrib["id"],
"image": {"width":cols[0].xpath('.//img')[0].attrib['width'],
"src": cols[0].xpath('.//img')[0].attrib['src']}
book_data["cover"] = {
"cover_id":cols[0].attrib["id"],
"image": {
"width":cols[0].xpath('.//img')[0].attrib['width'],
"src": cols[0].xpath('.//img')[0].attrib['src']
}
}
# title
book_data["title"] = {"href":cols[1].xpath('.//a')[0].attrib['href'],
"title":cols[1].xpath('.//a')[0].text}
book_data["title"] = {
"href":cols[1].xpath('.//a')[0].attrib['href'],
"title":cols[1].xpath('.//a')[0].text
}
# extract work_id and book_id from href
try:
(book_data["work_id"], book_data["book_id"]) = re.match("^/work/(.*)/book/(.*)$",book_data["title"]["href"]).groups()
(book_data["work_id"], book_data["book_id"]) = re.match(
"^/work/(.*)/book/(.*)$",
book_data["title"]["href"]
).groups()
except:
(book_data["work_id"], book_data["book_id"]) = (None, None)
# author -- what if there is more than 1? or none?
try:
book_data["author"] = {"display_name":cols[2].xpath('.//a')[0].text,
"href":cols[2].xpath('.//a')[0].attrib['href'],
"name":cols[2].xpath('div')[0].text}
book_data["author"] = {
"display_name":cols[2].xpath('.//a')[0].text,
"href":cols[2].xpath('.//a')[0].attrib['href'],
"name":cols[2].xpath('div')[0].text
}
except:
book_data["author"] = None
# date
book_data["date"] = cols[3].xpath('span')[0].text
# tags: grab tags that are not empty strings
tag_links = cols[4].xpath('.//a')
book_data["tags"] = filter(lambda x: x is not None, [a.text for a in tag_links])
# rating -- count # of stars
book_data["rating"] = len(cols[5].xpath('.//img[@alt="*"]'))
# entry date
book_data["entry_date"] = datetime.date(datetime.strptime(cols[6].xpath('span')[0].text, "%b %d, %Y"))
book_data["entry_date"] = datetime.date(
datetime.strptime(cols[6].xpath('span')[0].text, "%b %d, %Y")
)
yield book_data
def viewstyle_5(self, rows):
# implement this view to get at the ISBNs
for (i,row) in enumerate(rows):
for (i, row) in enumerate(rows):
book_data = {}
cols = row.xpath('td')
# title
book_data["title"] = {"href":cols[0].xpath('.//a')[0].attrib['href'],
"title":cols[0].xpath('.//a')[0].text}
# extract work_id and book_id from href
try:
(book_data["work_id"], book_data["book_id"]) = re.match("^/work/(.*)/book/(.*)$",book_data["title"]["href"]).groups()
(book_data["work_id"], book_data["book_id"]) = re.match(
"^/work/(.*)/book/(.*)$",
book_data["title"]["href"]
).groups()
except:
(book_data["work_id"], book_data["book_id"]) = (None, None)
# tags
tag_links = cols[1].xpath('.//a')
book_data["tags"] = filter(lambda x: x is not None, [a.text for a in tag_links])
@@ -121,13 +143,13 @@ class LibraryThing(object):
except Exception, e:
logger.info("no lc call number for: %s %s", book_data["title"], e)
book_data["lc_call_number"] = None
# subject
subjects = cols[3].xpath('.//div[@class="subjectLine"]')
book_data["subjects"] = [{'href':s.xpath('a')[0].attrib['href'],
'text':s.xpath('a')[0].text} for s in subjects]
# isbn
try:
book_data["isbn"] = cols[4].xpath('.//span')[0].text
@@ -136,90 +158,94 @@ class LibraryThing(object):
book_data["isbn"] = None
except Exception, e:
book_data["isbn"] = None
yield book_data
def parse_user_catalog(self, view_style=1):
from lxml import html
# we can vary viewstyle to get different info
IMPLEMENTED_STYLES = [1,5]
IMPLEMENTED_STYLES = [1, 5]
COLLECTION = 2 # set to get All Collections
if view_style not in IMPLEMENTED_STYLES:
raise NotImplementedError()
style_parser = getattr(self,"viewstyle_%s" % view_style)
style_parser = getattr(self, "viewstyle_%s" % view_style)
next_page = True
offset = 0
cookies = None
# go to the front page of LibraryThing first to pick up relevant session-like cookies
r = requests.get("https://www.librarything.com/")
cookies = r.cookies
while next_page:
url = "https://www.librarything.com/catalog_bottom.php?view=%s&viewstyle=%d&collection=%d&offset=%d" % (self.username,
view_style, COLLECTION, offset)
url = "https://www.librarything.com/catalog_bottom.php?view=%s&viewstyle=%d&collection=%d&offset=%d" % (
self.username, view_style, COLLECTION, offset
)
logger.info("url: %s", url)
if cookies is None:
r = requests.get(url)
else:
r = requests.get(url, cookies=cookies)
if r.status_code != httplib.OK:
raise LibraryThingException("Error accessing %s: %s" % (url, e))
logger.info("Error accessing %s: %s", url, e)
raise LibraryThingException("Error accessing %s: status %s" % (url, r.status_code))
etree = html.fromstring(r.content)
#logger.info("r.content %s", r.content)
cookies = r.cookies # retain the cookies
# look for a page bar
# try to grab the total number of books
# 1 - 50 of 82
try:
count_text = etree.xpath('//td[@class="pbGroup"]')[0].text
total = int(re.search(r'(\d+)$',count_text).group(1))
total = int(re.search(r'(\d+)$', count_text).group(1))
logger.info('total: %d', total)
except Exception, e: # assume for now that if we can't grab this text, there is no page bar and no books
except Exception, e:
# assume for now that if we can't grab this text,
# there is no page bar and no books
logger.info('Exception {0}'.format(e))
total = 0
# to do paging we can either look for a next link or just increase the offset by the number of rows.
# to do paging we can either look for a next link or just increase the offset
# by the number of rows.
# Let's try the latter
# possible_next_link = etree.xpath('//a[@class="pageShuttleButton"]')[0]
rows_xpath = '//table[@id="lt_catalog_list"]/tbody/tr'
# deal with page 1 first and then working on paging through the collection
rows = etree.xpath(rows_xpath)
i = -1 # have to account for the problem of style_parser(rows) returning nothing
for (i,row) in enumerate(style_parser(rows)):
yield row
# page size = 50, first page offset = 0, second page offset = 50 -- if total = 50 no need to go
offset += i + 1
i = -1 # have to account for the problem of style_parser(rows) returning nothing
for (i, row) in enumerate(style_parser(rows)):
yield row
# page size = 50, first page offset = 0, second page offset = 50
# -- if total = 50 no need to go
offset += i + 1
if offset >= total:
next_page = False
def load_librarything_into_wishlist(user, lt_username, max_books=None):
"""
Load a specified LibraryThing shelf (by default: all the books from the LibraryThing account associated with user)
Load a specified LibraryThing shelf (by default: all the books
from the LibraryThing account associated with user)
"""
from regluit.core import bookloader
from regluit.core import tasks
from itertools import islice
logger.info("Entering into load_librarything_into_wishlist")
lt = LibraryThing(lt_username)
for (i,book) in enumerate(islice(lt.parse_user_catalog(view_style=5),max_books)):
for (i, book) in enumerate(islice(lt.parse_user_catalog(view_style=5), max_books)):
isbn = book["isbn"] # grab the first one
logger.info("%d %s %s", i, book["title"]["title"], isbn)
try:
@@ -229,13 +255,27 @@ def load_librarything_into_wishlist(user, lt_username, max_books=None):
if not edition:
continue
# add the librarything ids to the db since we know them now
identifier= models.Identifier.get_or_add(type = 'thng', value = book['book_id'], edition = edition, work = edition.work)
identifier= models.Identifier.get_or_add(type = 'ltwk', value = book['work_id'], work = edition.work)
identifier = models.Identifier.get_or_add(
type='thng',
value=book['book_id'],
edition=edition,
work=edition.work
)
identifier = models.Identifier.get_or_add(
type='ltwk',
value=book['work_id'],
work=edition.work
)
if book['lc_call_number']:
identifier= models.Identifier.get_or_add(type = 'lccn', value = book['lc_call_number'], edition = edition, work = edition.work)
identifier = models.Identifier.get_or_add(
type='lccn',
value=book['lc_call_number'],
edition=edition,
work=edition.work
)
user.wishlist.add_work(edition.work, 'librarything', notify=True)
if edition.new:
tasks.populate_edition.delay(edition.isbn_13)
logger.info("Work with isbn %s added to wishlist.", isbn)
except Exception, e:
logger.info ("error adding ISBN %s: %s", isbn, e)
logger.info("error adding ISBN %s: %s", isbn, e)

View File

@@ -16,10 +16,10 @@ from .smashwords import SmashwordsScraper
def get_scraper(url):
scrapers = [
PressbooksScraper,
HathitrustScraper,
SpringerScraper,
UbiquityScraper,
SmashwordsScraper,
HathitrustScraper,
BaseScraper,
]
for scraper in scrapers:
@@ -52,3 +52,9 @@ def add_by_webpage(url, work=None, user=None):
def add_by_sitemap(url, maxnum=None):
return add_from_bookdatas(scrape_sitemap(url, maxnum=maxnum))
def scrape_language(url):
scraper = get_scraper(url)
return scraper.metadata.get('language')
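
scrape_language leans on get_scraper, which presumably returns the first scraper in the reordered list that claims the URL, with BaseScraper as the catch-all; moving HathitrustScraper to next-to-last changes that precedence. A hypothetical call:

lang = scrape_language('https://press.example.org/book/42')  # hypothetical URL
if lang and lang != 'xx':
    print 'detected language: %s' % lang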

View File

@@ -1,42 +1,54 @@
#!/usr/bin/env python
# encoding: utf-8
import logging
import datetime
import json
import logging
import re
from itertools import islice
import requests
from django.db.models import (Q, F)
from django.db.models import Q
from django.core.files.storage import default_storage
from django.core.files.base import ContentFile
from django.core.files.storage import default_storage
import regluit
from oaipmh.client import Client
from oaipmh.error import IdDoesNotExistError
from oaipmh.metadata import MetadataRegistry, oai_dc_reader
from regluit.core import bookloader, cc
from regluit.core import models, tasks
from regluit.core import bookloader
from regluit.core.bookloader import add_by_isbn, merge_works
from regluit.core.bookloader import merge_works
from regluit.core.isbn import ISBN
from regluit.core.validation import valid_subject
from regluit.core.loaders.utils import type_for_url
from regluit.core.validation import identifier_cleaner, valid_subject
from . import scrape_language
from .doab_utils import doab_lang_to_iso_639_1, online_to_download, url_to_provider
logger = logging.getLogger(__name__)
springercover = re.compile(r'ftp.+springer\.de.+(\d{13}\.jpg)$', flags=re.U)
def unlist(alist):
if not alist:
return None
return alist[0]
SPRINGER_COVER = re.compile(r'ftp.+springer\.de.+(\d{13}\.jpg)$', flags=re.U)
SPRINGER_IMAGE = u'https://images.springer.com/sgw/books/medium/{}.jpg'
def store_doab_cover(doab_id, redo=False):
"""
returns tuple: 1) cover URL, 2) whether newly created (boolean)
"""
cover_file_name= '/doab/%s/cover' % (doab_id)
cover_file_name = '/doab/%s/cover' % (doab_id)
# if we don't want to redo and the cover exists, return the URL of the cover
if not redo and default_storage.exists(cover_file_name):
return (default_storage.url(cover_file_name), False)
# download cover image to cover_file
url = "http://www.doabooks.org/doab?func=cover&rid={0}".format(doab_id)
try:
@@ -44,29 +56,31 @@ def store_doab_cover(doab_id, redo=False):
if r.status_code == 302:
redirurl = r.headers['Location']
if redirurl.startswith(u'ftp'):
springerftp = springercover.match(redirurl)
springerftp = SPRINGER_COVER.match(redirurl)
if springerftp:
redirurl = u'https://images.springer.com/sgw/books/medium/{}.jpg'.format(springerftp.groups(1))
redirurl = SPRINGER_IMAGE.format(springerftp.groups(1))
r = requests.get(redirurl)
else:
r = requests.get(url)
else:
r = requests.get(url)
r = requests.get(url)
cover_file = ContentFile(r.content)
cover_file.content_type = r.headers.get('content-type', '')
path = default_storage.save(cover_file_name, cover_file)
default_storage.save(cover_file_name, cover_file)
return (default_storage.url(cover_file_name), True)
except Exception, e:
# if there is a problem, return None for cover URL
logger.warning('Failed to make cover image for doab_id={}: {}'.format(doab_id, e))
return (None, False)
def update_cover_doab(doab_id, edition, store_cover=True):
def update_cover_doab(doab_id, edition, store_cover=True, redo=True):
"""
update the cover url for work with doab_id
if store_cover is True, use the cover from our own storage
"""
if store_cover:
(cover_url, new_cover) = store_doab_cover(doab_id)
(cover_url, new_cover) = store_doab_cover(doab_id, redo=redo)
else:
cover_url = "http://www.doabooks.org/doab?func=cover&rid={0}".format(doab_id)
@@ -74,131 +88,133 @@ def update_cover_doab(doab_id, edition, store_cover=True):
edition.cover_image = cover_url
edition.save()
return cover_url
else:
return None
return None
def attach_more_doab_metadata(edition, description, subjects,
publication_date, publisher_name=None, language=None, authors=u''):
"""
for given edition, attach description, subjects, publication date to
corresponding Edition and Work
"""
# if edition doesn't have a publication date, update it
# if edition doesn't have a publication date, update it
if not edition.publication_date:
edition.publication_date = publication_date
# if edition.publisher_name is empty, set it
if not edition.publisher_name:
edition.set_publisher(publisher_name)
edition.save()
# attach description to work if it's not empty
work = edition.work
if not work.description:
work.description = description
# update subjects
for s in subjects:
if valid_subject(s):
models.Subject.set_by_name(s, work=work)
# set reading level of work if it's empty; doab is for adults.
if not work.age_level:
work.age_level = '18-'
if language:
if language and language != 'xx':
work.language = language
work.save()
if authors and authors == authors: # test for authors != NaN
authlist = creator_list(authors)
if edition.authors.all().count() < len(authlist):
edition.authors.clear()
if authlist is not None:
for [rel,auth] in authlist:
for [rel, auth] in authlist:
edition.add_author(auth, rel)
return edition
def add_all_isbns(isbns, work, language=None, title=None):
first_edition = None
for isbn in isbns:
first_edition = None
edition = bookloader.add_by_isbn(isbn, work, language=language, title=title)
if edition:
first_edition = first_edition if first_edition else edition
if work and (edition.work_id != work.id):
first_edition = first_edition if first_edition else edition
if work and (edition.work_id != work.id):
if work.created < edition.work.created:
work = merge_works(work, edition.work)
else:
work = merge_works(edition.work, work)
else:
work = edition.work
return first_edition
return work, first_edition
def load_doab_edition(title, doab_id, url, format, rights,
language, isbns,
provider, **kwargs):
"""
load a record from doabooks.org represented by input parameters and return an ebook
"""
logger.info('load doab {} {} {} {} {}'.format(doab_id, format, rights, language, provider))
if language and isinstance(language, list):
language = language[0]
if language == 'xx' and format == 'online':
language = scrape_language(url)
# check to see whether the Edition hasn't already been loaded first
# search by url
ebooks = models.Ebook.objects.filter(url=url)
# 1 match
# > 1 matches
# 0 match
# simplest case -- if match (1 or more), we could check whether any
# ebook.edition.work has a doab id matching given doab_id
# put a migration to force Ebook.url to be unique id
# if yes, then return one of the Edition(s) whose work is doab_id
# if no, then
# if no, then
ebook = None
if len(ebooks) > 1:
raise Exception("There is more than one Ebook matching url {0}".format(url))
elif len(ebooks) == 1:
raise Exception("There is more than one Ebook matching url {0}".format(url))
elif len(ebooks) == 1:
ebook = ebooks[0]
doab_identifer = models.Identifier.get_or_add(type='doab',value=doab_id,
work=ebook.edition.work)
# update the cover id
cover_url = update_cover_doab(doab_id, ebook.edition)
doab_identifer = models.Identifier.get_or_add(type='doab', value=doab_id,
work=ebook.edition.work)
if not ebook.rights:
ebook.rights = rights
ebook.save()
# update the cover id
cover_url = update_cover_doab(doab_id, ebook.edition, redo=False)
# attach more metadata
attach_more_doab_metadata(ebook.edition,
description=kwargs.get('description'),
subjects=kwargs.get('subject'),
publication_date=kwargs.get('date'),
publisher_name=kwargs.get('publisher'),
language=language,
authors=kwargs.get('authors'),)
attach_more_doab_metadata(
ebook.edition,
description=unlist(kwargs.get('description')),
subjects=kwargs.get('subject'),
publication_date=unlist(kwargs.get('date')),
publisher_name=unlist(kwargs.get('publisher')),
language=language,
authors=kwargs.get('creator'),
)
# make sure all isbns are added
add_all_isbns(isbns, None, language=language, title=title)
return ebook
add_all_isbns(isbns, ebook.edition.work, language=language, title=title)
return ebook.edition
# remaining case --> no ebook, load record, create ebook if there is one.
assert len(ebooks) == 0
assert not ebooks
# we need to find the right Edition/Work to tie Ebook to...
# look for the Edition with which to associate ebook.
# loop through the isbns to see whether we get one that is not None
work = None
edition = add_all_isbns(isbns, None, language=language, title=title)
if edition:
edition.refresh_from_db()
work = edition.work
work, edition = add_all_isbns(isbns, None, language=language, title=title)
if doab_id and not work:
# make sure there's not already a doab_id
idents = models.Identifier.objects.filter(type='doab', value=doab_id)
@@ -206,16 +222,17 @@ def load_doab_edition(title, doab_id, url, format, rights,
edition = ident.work.preferred_edition
work = edition.work
break
if edition is not None:
# if this is a new edition, then add related editions asynchronously
if getattr(edition,'new', False):
tasks.populate_edition.delay(edition.isbn_13)
doab_identifer = models.Identifier.get_or_add(type='doab', value=doab_id,
work=edition.work)
# we need to create Edition(s) de novo
else:
if edition is not None:
# if this is a new edition, then add related editions SYNCHRONOUSLY
if getattr(edition, 'new', False):
tasks.populate_edition(edition.isbn_13)
edition.refresh_from_db()
doab_identifer = models.Identifier.get_or_add(type='doab', value=doab_id,
work=edition.work)
# we need to create Edition(s) de novo
else:
# if there is a Work with doab_id already, attach any new Edition(s)
try:
work = models.Identifier.objects.get(type='doab', value=doab_id).work
@@ -226,11 +243,11 @@ def load_doab_edition(title, doab_id, url, format, rights,
work = models.Work(language='xx', title=title, age_level='18-')
work.save()
doab_identifer = models.Identifier.get_or_add(type='doab', value=doab_id,
work=work)
work=work)
# if work has any ebooks already, attach the ebook to the corresponding edition
# otherwise pick the first one
# pick the first edition as the one to tie ebook to
# pick the first edition as the one to tie ebook to
editions_with_ebooks = models.Edition.objects.filter(Q(work__id=work.id) & \
Q(ebooks__isnull=False)).distinct()
if editions_with_ebooks:
@@ -240,73 +257,41 @@ def load_doab_edition(title, doab_id, url, format, rights,
else:
edition = models.Edition(work=work, title=title)
edition.save()
# make the edition the selected_edition of the work
work.selected_edition = edition
work.save()
if format in ('pdf', 'epub', 'mobi'):
if format in ('pdf', 'epub', 'mobi', 'html', 'online') and rights:
ebook = models.Ebook()
ebook.format = format
ebook.provider = provider
ebook.url = url
ebook.url = url
ebook.rights = rights
# tie the edition to ebook
ebook.edition = edition
if format == "online":
ebook.active = False
ebook.save()
# update the cover id (could be done separately)
cover_url = update_cover_doab(doab_id, edition)
cover_url = update_cover_doab(doab_id, edition, redo=False)
# attach more metadata
attach_more_doab_metadata(edition,
description=kwargs.get('description'),
subjects=kwargs.get('subject'),
publication_date=kwargs.get('date'),
publisher_name=kwargs.get('publisher'),
authors=kwargs.get('authors'),)
return ebook
attach_more_doab_metadata(
edition,
description=unlist(kwargs.get('description')),
subjects=kwargs.get('subject'),
publication_date=unlist(kwargs.get('date')),
publisher_name=unlist(kwargs.get('publisher')),
authors=kwargs.get('creator'),
)
return edition
def load_doab_records(fname, limit=None):
success_count = 0
ebook_count = 0
records = json.load(open(fname))
for (i, book) in enumerate(islice(records,limit)):
d = dict(book)
d['isbns'] = split_isbns(d['isbns_raw']) # use stricter isbn string parsing.
try:
ebook = load_doab_edition(**d)
success_count += 1
if ebook:
ebook_count +=1
except Exception, e:
logger.error(e)
logger.error(book)
logger.info("Number of records processed: " + str(success_count))
logger.info("Number of ebooks processed: " + str(ebook_count))
"""
#
#tools to parse the author lists in doab.csv
from pandas import DataFrame
url = "http://www.doabooks.org/doab?func=csv"
df_csv = DataFrame.from_csv(url)
#
out=[]
for val in df_csv.values:
isbn = split_isbns(val[0])
if isbn:
auths = []
if val[2] == val[2] and val[-2] == val[-2]: # test for NaN auths and licenses
auths = creator_list(val[2])
out.append(( isbn[0], auths))
open("/Users/eric/doab_auths.json","w+").write(json.dumps(out,indent=2, separators=(',', ': ')))
"""
au = re.compile(r'\(Authors?\)', flags=re.U)
ed = re.compile(r'\([^\)]*(dir.|[Eeé]ds?.|org.|coord.|Editor|a cura di|archivist)[^\)]*\)', flags=re.U)
tr = re.compile(r'\([^\)]*([Tt]rans.|tr.|translated by)[^\)]*\)', flags=re.U)
@@ -326,14 +311,14 @@ def fnf(auth):
if len(parts) == 1:
return parts[0].strip()
elif len(parts) == 2:
return u'{} {}'.format(parts[1].strip(),parts[0].strip())
return u'{} {}'.format(parts[1].strip(), parts[0].strip())
else:
if parts[1].strip() in ('der','van', 'von', 'de', 'ter'):
return u'{} {} {}'.format(parts[2].strip(),parts[1].strip(),parts[0].strip())
if parts[1].strip() in ('der', 'van', 'von', 'de', 'ter'):
return u'{} {} {}'.format(parts[2].strip(), parts[1].strip(), parts[0].strip())
#print auth
#print re.search(namelist,auth).group(0)
return u'{} {}, {}'.format(parts[2].strip(),parts[0].strip(),parts[1].strip())
return u'{} {}, {}'.format(parts[2].strip(), parts[0].strip(), parts[1].strip())
def creator(auth, editor=False):
auth = auth.strip()
@@ -349,68 +334,100 @@ def creator(auth, editor=False):
return [u'dsr', fnf(ds.sub(u'', auth))]
if re.search(cm, auth):
return [u'com', fnf(cm.sub(u'', auth))]
auth = au.sub('', auth)
return ['aut', fnf(auth)]
def split_auths(auths):
if ';' in auths or '/' in auths:
return namesep2.split(auths)
else:
nl = namelist.match(auths.strip())
if nl:
if nl.group(3).endswith(' de') \
or ' de ' in nl.group(3) \
or nl.group(3).endswith(' da') \
or nl.group(1).endswith(' Jr.') \
or ' e ' in nl.group(1):
return [auths]
else:
return namesep.split(auths)
else :
return [auths]
def split_isbns(isbns):
result = []
for isbn in isbnsep.split(isbns):
isbn = ISBN(isbn)
if isbn.valid:
result.append(isbn.to_string())
return result
def creator_list(creators):
auths = []
if re.search(edlist, creators):
for auth in split_auths(edlist.sub(u'', creators)):
if auth:
auths.append(creator(auth, editor=True))
else:
for auth in split_auths(unicode(creators)):
if auth:
auths.append(creator(auth))
for auth in creators:
auths.append(creator(auth))
return auths
def load_doab_auths(fname, limit=None):
doab_auths = json.load(open(fname))
recnum = 0
failed = 0
for [isbnraw, authlist] in doab_auths:
isbn = ISBN(isbnraw).to_string()
try:
work = models.Identifier.objects.get(type='isbn',value=isbn).work
except models.Identifier.DoesNotExist:
print 'isbn = {} not found'.format(isbnraw)
failed += 1
if work.preferred_edition.authors.all().count() < len(authlist):
work.preferred_edition.authors.clear()
if authlist is None:
print "null authlist; isbn={}".format(isbn)
DOAB_OAIURL = 'https://www.doabooks.org/oai'
DOAB_PATT = re.compile(r'[\./]doabooks\.org/doab\?.*rid:(\d{1,8}).*')
mdregistry = MetadataRegistry()
mdregistry.registerReader('oai_dc', oai_dc_reader)
doab_client = Client(DOAB_OAIURL, mdregistry)
isbn_cleaner = identifier_cleaner('isbn', quiet=True)
ISBNSEP = re.compile(r'[/]+')
def add_by_doab(doab_id, record=None):
try:
record = record if record else doab_client.getRecord(
metadataPrefix='oai_dc',
identifier='oai:doab-books:{}'.format(doab_id)
)
metadata = record[1].getMap()
isbns = []
url = None
for ident in metadata.pop('identifier', []):
if ident.startswith('ISBN: '):
isbn_strings = ISBNSEP.split(ident[6:].strip())
for isbn_string in isbn_strings:
isbn = isbn_cleaner(isbn_string)
if isbn:
isbns.append(isbn)
elif ident.find('doabooks.org') >= 0:
# should already know the doab_id
continue
for [rel,auth] in authlist:
work.preferred_edition.add_author(auth, rel)
recnum +=1
if limit and recnum > limit:
break
logger.info("Number of records processed: " + str(recnum))
logger.info("Number of missing isbns: " + str(failed))
else:
url = ident
language = doab_lang_to_iso_639_1(unlist(metadata.pop('language', None)))
urls = online_to_download(url)
edition = None
title = unlist(metadata.pop('title', None))
license = cc.license_from_cc_url(unlist(metadata.pop('rights', None)))
for dl_url in urls:
format = type_for_url(dl_url)
if 'format' in metadata:
del metadata['format']
edition = load_doab_edition(
title,
doab_id,
dl_url,
format,
license,
language,
isbns,
url_to_provider(dl_url) if dl_url else None,
**metadata
)
return edition
except IdDoesNotExistError:
return None
def getdoab(url):
id_match = DOAB_PATT.search(url)
if id_match:
return id_match.group(1)
return False
def load_doab_oai(from_year=None, limit=100000):
'''
use oai feed to get oai updates
'''
if from_year:
from_ = datetime.datetime(year=from_year, month=1, day=1)
else:
# last 45 days
from_ = datetime.datetime.now() - datetime.timedelta(days=45)
doab_ids = []
for record in doab_client.listRecords(metadataPrefix='oai_dc', from_=from_):
if not record[1]:
continue
item_type = unlist(record[1].getMap().get('type', None))
if item_type != 'book':
continue
idents = record[1].getMap()['identifier']
if idents:
for ident in idents:
doab = getdoab(ident)
if doab:
doab_ids.append(doab)
e = add_by_doab(doab, record=record)
title = e.title if e else None
logger.info(u'updated:\t{}\t{}'.format(doab, title))
if len(doab_ids) > limit:
break
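
For orientation, a minimal pyoai sketch of what add_by_doab() does before mapping metadata; the endpoint and the oai:doab-books: identifier scheme come from the module above, while the record id here is hypothetical:

from oaipmh.client import Client
from oaipmh.metadata import MetadataRegistry, oai_dc_reader

registry = MetadataRegistry()
registry.registerReader('oai_dc', oai_dc_reader)
client = Client('https://www.doabooks.org/oai', registry)
record = client.getRecord(metadataPrefix='oai_dc',
                          identifier='oai:doab-books:12345')  # hypothetical id
metadata = record[1].getMap()  # dict of dc fields: title, identifier, rights, ...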

core/loaders/doab_utils.py (new file, 128 lines added)
View File

@@ -0,0 +1,128 @@
"""
doab_utils.py
"""
import re
import urlparse
import requests
from regluit.utils.lang import get_language_code
from .utils import get_soup
# utility functions for converting lists of individual items into individual items
# let's do a mapping of the DOAB languages into the language codes used
# mostly, we just handle misspellings
# also null -> xx
EXTRA_LANG_MAP = dict([
(u'chinese', 'zh'),
(u'deutsch', 'de'),
(u'eng', 'en'),
(u'englilsh', 'en'),
(u'englisch', 'en'),
(u'espanol', 'es'),
(u'ger', 'de'),
(u'fra', 'fr'),
(u'fre', 'fr'),
(u'francese', 'fr'),
(u'ita', 'it'),
(u'italiano', 'it'),
(u'norwegian', 'no'),
(u'por', 'pt'),
(u'portugese', 'pt'),
(u'slovene', 'sl'),
(u'spa', 'es'),
(u'spagnolo', 'es'),
])
sep = re.compile(r'[ \-;^,/]+')
def doab_lang_to_iso_639_1(lang):
if lang is None or not lang:
return "xx"
else:
lang = sep.split(lang)[0]
code = get_language_code(lang)
if code:
return code
else:
return EXTRA_LANG_MAP.get(lang.lower(), 'xx')
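
Expected behavior, assuming get_language_code resolves standard names and codes and the map above catches the rest:

assert doab_lang_to_iso_639_1(None) == 'xx'          # null -> xx, per the comment above
assert doab_lang_to_iso_639_1(u'englisch') == 'en'   # misspelling handled by EXTRA_LANG_MAP
assert doab_lang_to_iso_639_1(u'fre; eng') == 'fr'   # only the first token is considered
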
DOMAIN_TO_PROVIDER = dict([
[u'antropologie.zcu.cz', u'AntropoWeb'],
[u'books.mdpi.com', u'MDPI Books'],
[u'books.openedition.org', u'OpenEdition Books'],
[u'books.scielo.org', u'SciELO'],
[u'ccdigitalpress.org', u'Computers and Composition Digital Press'],
[u'digitalcommons.usu.edu', u'DigitalCommons, Utah State University'],
[u'dl.dropboxusercontent.com', u'Dropbox'],
[u'dspace.ucalgary.ca', u'Institutional Repository at the University of Calgary'],
[u'dx.doi.org', u'DOI Resolver'],
[u'ebooks.iospress.nl', u'IOS Press Ebooks'],
[u'hdl.handle.net', u'Handle Proxy'],
[u'hw.oeaw.ac.at', u'Austrian Academy of Sciences'],
[u'img.mdpi.org', u'MDPI Books'],
[u'ledibooks.com', u'LediBooks'],
[u'leo.cilea.it', u'LEO '],
[u'leo.cineca.it', u'Letteratura Elettronica Online'],
[u'link.springer.com', u'Springer'],
[u'oapen.org', u'OAPEN Library'],
[u'press.openedition.org', u'OpenEdition Press'],
[u'windsor.scholarsportal.info', u'Scholars Portal'],
[u'www.adelaide.edu.au', u'University of Adelaide'],
[u'www.aliprandi.org', u'Simone Aliprandi'],
[u'www.antilia.to.it', u'antilia.to.it'],
[u'www.aupress.ca', u'Athabasca University Press'],
[u'www.bloomsburyacademic.com', u'Bloomsbury Academic'],
[u'www.co-action.net', u'Co-Action Publishing'],
[u'www.degruyter.com', u'De Gruyter Online'],
[u'www.doabooks.org', u'Directory of Open Access Books'],
[u'www.dropbox.com', u'Dropbox'],
[u'www.ebooks.iospress.nl', u'IOS Press Ebooks'],
[u'www.ledizioni.it', u'Ledizioni'],
[u'www.maestrantonella.it', u'maestrantonella.it'],
[u'www.oapen.org', u'OAPEN Library'],
[u'www.openbookpublishers.com', u'Open Book Publishers'],
[u'www.palgraveconnect.com', u'Palgrave Connect'],
[u'www.scribd.com', u'Scribd'],
[u'www.springerlink.com', u'Springer'],
[u'www.ubiquitypress.com', u'Ubiquity Press'],
[u'www.unimib.it', u'University of Milano-Bicocca'],
[u'www.unito.it', u"University of Turin"],
])
def url_to_provider(url):
netloc = urlparse.urlparse(url).netloc
return DOMAIN_TO_PROVIDER.get(netloc, netloc)
FRONTIERSIN = re.compile(r'frontiersin.org/books/[^/]+/(\d+)')
def online_to_download(url):
urls = []
if url.find(u'mdpi.com/books/pdfview/book/') >= 0:
doc = get_soup(url)
if doc:
obj = doc.find('object', type='application/pdf')
if obj:
urls.append(obj['data'].split('#')[0])
elif url.find(u'books.scielo.org/') >= 0:
doc = get_soup(url)
if doc:
obj = doc.find('a', class_='pdf_file')
if obj:
urls.append(urlparse.urljoin(url, obj['href']))
obj = doc.find('a', class_='epub_file')
if obj:
urls.append(urlparse.urljoin(url, obj['href']))
elif FRONTIERSIN.search(url):
booknum = FRONTIERSIN.search(url).group(1)
urls.append(u'https://www.frontiersin.org/GetFile.aspx?ebook={}&fileformat=EPUB'.format(booknum))
urls.append(u'https://www.frontiersin.org/GetFile.aspx?ebook={}&fileformat=PDF'.format(booknum))
else:
urls.append(url)
return urls
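
Two illustrative cases, per the branches above: Frontiers 'online' URLs are rewritten into direct EPUB and PDF GetFile links, while unrecognized hosts pass through unchanged (both URLs here are hypothetical):

urls = online_to_download(u'https://www.frontiersin.org/books/Example_Title/123')
# -> the two GetFile.aspx links for ebook 123, one EPUB and one PDF

urls = online_to_download(u'https://example.org/direct/book.pdf')
# -> [u'https://example.org/direct/book.pdf']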

View File

@@ -26,38 +26,54 @@ class HathitrustScraper(BaseScraper):
for record in records:
self.record = record
return
self.record = {}
self.record = None # probably a hdl not pointing at Hathitrust
self.record = None
def get_downloads(self):
dl_a = self.doc.select_one('#fullPdfLink')
value = dl_a['href'] if dl_a else None
if value:
self.set(
'download_url_{}'.format('pdf'),
'https://babel.hathitrust.org{}'.format(value)
)
if self.record:
dl_a = self.doc.select_one('#fullPdfLink')
value = dl_a['href'] if dl_a else None
if value:
self.set(
'download_url_{}'.format('pdf'),
'https://babel.hathitrust.org{}'.format(value)
)
return super(HathitrustScraper, self).get_downloads()
def get_isbns(self):
isbn = self.record.get('issn', [])
value = identifier_cleaner('isbn', quiet=True)(isbn)
return {'print': value} if value else {}
if self.record:
isbn = self.record.get('issn', [])
value = identifier_cleaner('isbn', quiet=True)(isbn)
return {'print': value} if value else {}
return super(HathitrustScraper, self).get_isbns()
def get_title(self):
self.set('title', self.record.get('title', ''))
if self.record:
self.set('title', self.record.get('title', ''))
return super(HathitrustScraper, self).get_title()
def get_keywords(self):
self.set('subjects', self.record.get('keywords', []))
if self.record:
self.set('subjects', self.record.get('keywords', []))
return super(HathitrustScraper, self).get_keywords()
def get_publisher(self):
self.set('publisher', self.record.get('publisher', ''))
if self.record:
self.set('publisher', self.record.get('publisher', ''))
return super(HathitrustScraper, self).get_publisher()
def get_pubdate(self):
self.set('publication_date', self.record.get('year', ''))
if self.record:
self.set('publication_date', self.record.get('year', ''))
return super(HathitrustScraper, self).get_pubdate()
def get_description(self):
notes = self.record.get('notes', [])
self.set('description', '\r'.join(notes))
if self.record:
notes = self.record.get('notes', [])
self.set('description', '\r'.join(notes))
return super(HathitrustScraper, self).get_description()
def get_genre(self):
self.set('genre', self.record.get('type_of_reference', '').lower())
if self.record:
self.set('genre', self.record.get('type_of_reference', '').lower())
return super(HathitrustScraper, self).get_genre()

View File

@@ -110,15 +110,19 @@ class SpringerScraper(BaseScraper):
self.set('publisher', 'Springer')
search_url = 'https://link.springer.com/search/page/{}?facet-content-type=%22Book%22&package=openaccess'
def load_springer(num_pages):
def springer_open_books(num_pages):
for page in range(1, num_pages+1):
def load_springer(startpage=1, endpage=None):
def springer_open_books(startpage, endpage):
endpage = endpage if endpage else startpage + 10
for page in range(startpage, endpage + 1):
url = search_url.format(page)
response = requests.get(url, headers={"User-Agent": settings.USER_AGENT})
if response.status_code == 200:
base = response.url
doc = BeautifulSoup(response.content, 'lxml')
for link in doc.select('a.title'):
book_url = urljoin(base, link['href'])
yield SpringerScraper(book_url)
return add_from_bookdatas(springer_open_books(num_pages))
try:
response = requests.get(url, headers={"User-Agent": settings.USER_AGENT})
if response.status_code == 200:
base = response.url
doc = BeautifulSoup(response.content, 'lxml')
for link in doc.select('a.title'):
book_url = urljoin(base, link['href'])
yield SpringerScraper(book_url)
except requests.exceptions.ConnectionError:
print 'couldn\'t connect to %s' % url
return add_from_bookdatas(springer_open_books(startpage, endpage))
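
With the new signature, callers name an explicit window of search pages rather than a count from page 1; endpage defaults to startpage + 10. For example:

editions = load_springer(startpage=1, endpage=3)  # harvest search pages 1 through 3
# load_springer(5) would scan pages 5 through 15, since endpage defaults to startpage + 10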

core/loaders/tests.py (new file, 28 lines added)
View File

@@ -0,0 +1,28 @@
from django.conf import settings
from django.test import TestCase
from regluit.core.models import Ebook, Edition, Work
from .utils import dl_online
class LoaderTests(TestCase):
def setUp(self):
pass
def test_downloads(self):
if not (settings.TEST_INTEGRATION):
return
work = Work(title="online work")
work.save()
edition = Edition(work=work)
edition.save()
dropbox_url = 'https://www.dropbox.com/s/h5jzpb4vknk8n7w/Jakobsson_The_Troll_Inside_You_EBook.pdf?dl=0'
dropbox_ebook = Ebook.objects.create(format='online', url=dropbox_url, edition=edition)
dropbox_ebf = dl_online(dropbox_ebook)
self.assertTrue(dropbox_ebf.ebook.filesize)
jbe_url = 'http://www.jbe-platform.com/content/books/9789027295958'
jbe_ebook = Ebook.objects.create(format='online', url=jbe_url, edition=edition)
jbe_ebf = dl_online(jbe_ebook)
self.assertTrue(jbe_ebf.ebook.filesize)

View File

@ -1,15 +1,23 @@
import csv
import re
import requests
import logging
import sys
import re
import time
import unicodedata
import urlparse
from bs4 import BeautifulSoup
import requests
from django.conf import settings
from django.core.files.base import ContentFile
from regluit.core.models import Work, Edition, Author, PublisherName, Identifier, Subject
from regluit.core.isbn import ISBN
from regluit.core.bookloader import add_by_isbn_from_google, merge_works
from regluit.api.crosswalks import inv_relator_contrib
from regluit.bisac.models import BisacHeading
from regluit.core.bookloader import add_by_isbn_from_google, merge_works
from regluit.core.isbn import ISBN
from regluit.core.models import (
Ebook, EbookFile, Edition, Identifier, path_for_file, Subject, Work,
)
logger = logging.getLogger(__name__)
@ -22,7 +30,7 @@ def utf8_general_ci_norm(s):
"""
Normalize a la MySQL utf8_general_ci collation
(As of 2016.05.24, we're using the utf8_general_ci collation for author names)
https://stackoverflow.com/questions/1036454/what-are-the-diffrences-between-utf8-general-ci-and-utf8-unicode-ci/1036459#1036459
* converts to Unicode normalization form D for canonical decomposition
@ -34,79 +42,84 @@ def utf8_general_ci_norm(s):
s1 = unicodedata.normalize('NFD', s)
return ''.join(c for c in s1 if not unicodedata.combining(c)).upper()
def get_soup(url):
response = requests.get(url, headers={"User-Agent": settings.USER_AGENT})
if response.status_code == 200:
return BeautifulSoup(response.content, 'lxml')
return None
def get_authors(book):
authors=[]
if book.get('AuthorsList',''):
authors = []
if book.get('AuthorsList', ''):
#UMich
for i in range(1,3):
fname=u'Author{}First'.format(i)
lname=u'Author{}Last'.format(i)
role=u'Author{}Role'.format(i)
authname = u'{} {}'.format(book[fname],book[lname])
for i in range(1, 3):
fname = u'Author{}First'.format(i)
lname = u'Author{}Last'.format(i)
role = u'Author{}Role'.format(i)
authname = u'{} {}'.format(book[fname], book[lname])
if authname != u' ':
role = book[role] if book[role].strip() else 'A01'
authors.append((authname,role))
authors.append((authname, role))
else:
break
authlist = book["AuthorsList"].replace(' and ', ', ').split(', ')
if len(authlist)>3:
if len(authlist) > 3:
for authname in authlist[3:]:
authors.append((authname, 'A01'))
else:
#OBP
for i in range(1,6):
fname= book.get(u'Contributor {} first name'.format(i), '')
lname= book.get(u'Contributor {} surname'.format(i), '')
role= book.get(u'ONIX Role Code (List 17){}'.format(i), '')
authname = u'{} {}'.format(fname,lname)
for i in range(1, 6):
fname = book.get(u'Contributor {} first name'.format(i), '')
lname = book.get(u'Contributor {} surname'.format(i), '')
role = book.get(u'ONIX Role Code (List 17){}'.format(i), '')
authname = u'{} {}'.format(fname, lname)
if authname != u' ':
role = role if role.strip() else 'A01'
authors.append((authname,role))
authors.append((authname, role))
else:
break
return authors
def get_subjects(book):
subjects=[]
for i in range(1,5):
subjects = []
for i in range(1, 5):
key = u'BISACCode{}'.format(i) #UMich dialect
key2 = u'BISAC subject code {}'.format(i) #OBP dialect
code = book.get(key,'')
code = code if code else book.get(key2,'')
code = book.get(key, '')
code = code if code else book.get(key2, '')
if code != '':
try:
bisac=BisacHeading.objects.get(notation=code)
bisac = BisacHeading.objects.get(notation=code)
subjects.append(bisac)
except BisacHeading.DoesNotExist:
logger.warning( "Please add BISAC {}".format(code))
logger.warning("Please add BISAC {}".format(code))
return subjects
def add_subject(subject_name, work, authority=''):
try:
subject= Subject.objects.get(name=subject_name)
subject = Subject.objects.get(name=subject_name)
except Subject.DoesNotExist:
subject=Subject.objects.create(name=subject_name, authority=authority)
subject = Subject.objects.create(name=subject_name, authority=authority)
subject.works.add(work)
def get_title(book):
title = book.get('FullTitle','') #UMICH
title = book.get('FullTitle', '') #UMICH
if title:
return title
title = book.get('Title','') #OBP
sub = book.get('Subtitle','')
title = book.get('Title', '') #OBP
sub = book.get('Subtitle', '')
if sub:
return u'{}: {}'.format(title,sub)
else:
return title
return u'{}: {}'.format(title, sub)
return title
def get_cover(book):
cover_url = book.get('Cover URL','') #OBP
cover_url = book.get('Cover URL', '') #OBP
if cover_url:
return cover_url
url = book['URL']
if "10.3998" in url:
# code for umich books; can generalize, of course!
idmatch= re.search( r'([^/]+)\.(\d+\.\d+\.\d+)', url)
idmatch = re.search(r'([^/]+)\.(\d+\.\d+\.\d+)', url)
if idmatch:
book_id = idmatch.group(2)
if idmatch.group(1) == 'ohp':
@ -116,74 +129,78 @@ def get_cover(book):
else:
cover_url = "http://quod.lib.umich.edu/d/dculture/images/{}.jpg".format(book_id)
cover = requests.head(cover_url)
if cover.status_code<400:
if cover.status_code < 400:
return cover_url
else:
logger.warning( "bad cover: {} for: {}".format(cover_url, url))
logger.warning("bad cover: {} for: {}".format(cover_url, url))
def get_isbns(book):
isbns = []
edition = None
#'ISBN 1' is OBP, others are UMICH
for code in ['eISBN', 'ISBN 3','PaperISBN', 'ISBN 2', 'ClothISBN', 'ISBN 1', 'ISBN 4', 'ISBN 5']:
if book.get(code, '') not in ('','N/A'):
for code in ['eISBN', 'ISBN 3', 'PaperISBN', 'ISBN 2', 'ClothISBN',
'ISBN 1', 'ISBN 4', 'ISBN 5'
]:
if book.get(code, '') not in ('', 'N/A'):
values = book[code].split(',')
for value in values:
isbn = ISBN(value).to_string()
if isbn:
isbns.append(isbn)
for isbn in isbns :
for isbn in isbns:
if not edition:
edition = Edition.get_by_isbn(isbn)
return (isbns, edition )
return (isbns, edition)
def get_pubdate(book):
value = book.get('CopyrightYear','') #UMICH
value = book.get('CopyrightYear', '') #UMICH
if value:
return value
value = book.get('publication year','') #OBP
sub = book.get('publication month','')
sub2 = book.get('publication day','')
value = book.get('publication year', '') #OBP
sub = book.get('publication month', '')
sub2 = book.get('publication day', '')
if sub2:
return u'{}-{}-{}'.format(value,sub,sub2)
return u'{}-{}-{}'.format(value, sub, sub2)
elif sub:
return u'{}-{}'.format(value,sub,sub2)
else:
return value
return u'{}-{}'.format(value, sub, sub2)
return value
def get_publisher(book):
value = book.get('Publisher','')
value = book.get('Publisher', '')
if value:
return value
if book.get('DOI prefix','')=='10.11647':
if book.get('DOI prefix', '') == '10.11647':
return "Open Book Publishers"
def get_url(book):
url = book.get('URL','')
url = url if url else u'https://doi.org/{}/{}'.format( book.get('DOI prefix',''),book.get('DOI suffix',''))
url = book.get('URL', '')
url = url if url else u'https://doi.org/{}/{}'.format(
book.get('DOI prefix', ''),
book.get('DOI suffix', '')
)
return url
def get_description(book):
value = book.get('DescriptionBrief','')
value = value if value else book.get('Plain Text Blurb','')
value = book.get('DescriptionBrief', '')
value = value if value else book.get('Plain Text Blurb', '')
return value
def get_language(book):
value = book.get('ISO Language Code','')
value = book.get('ISO Language Code', '')
return value
def load_from_books(books):
''' books is an iterator of book dicts.
each book must have attributes
(umich dialect)
eISBN, ClothISBN, PaperISBN, Publisher, FullTitle, Title, Subtitle, AuthorsList,
Author1Last, Author1First, Author1Role, Author2Last, Author2First, Author2Role, Author3Last,
Author3First, Author3Role, AuthorBio, TableOfContents, Excerpt, DescriptionLong,
DescriptionBrief, BISACCode1, BISACCode2, BISACCode3, CopyrightYear, ePublicationDate,
eListPrice, ListPriceCurrencyType, List Price in USD (paper ISBN), eTerritoryRights,
SubjectListMARC, , Book-level DOI, URL, License
'''
# Goal: get or create an Edition and Work for each given book
@ -194,21 +211,21 @@ def load_from_books(books):
# try first to get an Edition already in DB with by one of the ISBNs in book
(isbns, edition) = get_isbns(book)
if len(isbns)==0:
if not isbns:
continue
title=get_title(book)
title = get_title(book)
authors = get_authors(book)
# if matching by ISBN doesn't work, then create a Work and Edition
# if matching by ISBN doesn't work, then create a Work and Edition
# with a title and the first ISBN
if not edition:
work = Work(title=title)
work.save()
edition= Edition(title=title, work=work)
edition = Edition(title=title, work=work)
edition.save()
Identifier.set(type='isbn', value=isbns[0], edition=edition, work=work)
work=edition.work
work = edition.work
# at this point, work and edition exist
url = get_url(book)
@ -222,7 +239,7 @@ def load_from_books(books):
if edition and edition.work != work:
work = merge_works(work, edition.work)
if not edition:
edition= Edition(title=title, work=work)
edition = Edition(title=title, work=work)
edition.save()
Identifier.set(type='isbn', value=isbn, edition=edition, work=work)
@ -234,18 +251,18 @@ def load_from_books(books):
edition.save()
edition.set_publisher(get_publisher(book))
# possibly replace work.description
# possibly replace work.description
description = get_description(book)
if len(description)>len (work.description):
if len(description) > len(work.description):
work.description = description
work.save()
# set language
lang= get_language(book)
lang = get_language(book)
if lang:
work.language = lang
work.save()
# add a bisac subject (and ancestors) to work
for bisacsh in get_subjects(book):
while bisacsh:
@ -258,13 +275,13 @@ def load_from_books(books):
results.append((book, work, edition))
try:
logger.info (u"{} {} {}\n".format(i, title, loading_ok))
logger.info(u"{} {} {}\n".format(i, title, loading_ok))
except Exception as e:
logger.info (u"{} {}\n".format(i, title, str(e) ))
logger.info(u"{} {} {}\n".format(i, title, str(e)))
return results
def loaded_book_ok(book, work, edition):
isbns = get_isbns(book)[0]
@ -277,10 +294,10 @@ def loaded_book_ok(book, work, edition):
try:
url_id = Identifier.objects.get(type='http', value=get_url(book))
if url_id is None:
logger.info ("url_id problem: work.id {}, url: {}".format(work.id, get_url(book)))
logger.info("url_id problem: work.id {}, url: {}".format(work.id, get_url(book)))
return False
except Exception as e:
logger.info (str(e))
logger.info(str(e))
return False
# isbns
@ -292,15 +309,17 @@ def loaded_book_ok(book, work, edition):
try:
edition_for_isbn = Identifier.objects.get(type='isbn', value=isbn).edition
except Exception as e:
print (e)
logger.info(e)
return False
# authors
# print set([ed.name for ed in edition_for_isbn.authors.all()])
if (set([utf8_general_ci_norm(author[0]) for author in authors]) !=
set([utf8_general_ci_norm(ed.name) for ed in edition_for_isbn.authors.all()])):
print "problem with authors"
if (
set([utf8_general_ci_norm(author[0]) for author in authors]) !=
set([utf8_general_ci_norm(ed.name) for ed in edition_for_isbn.authors.all()])
):
logger.info("problem with authors")
return False
try:
@ -312,7 +331,7 @@ def loaded_book_ok(book, work, edition):
# work description
description = get_description(book)
if not ((work.description == description) or (len(description) <len (work.description))):
if not ((work.description == description) or (len(description) < len(work.description))):
return False
# bisac
@ -331,14 +350,15 @@ def loaded_book_ok(book, work, edition):
return True
ID_URLPATTERNS = {
'goog': re.compile(r'[\./]google\.com/books\?.*id=([a-zA-Z0-9\-_]{12})'),
'olwk': re.compile(r'[\./]openlibrary\.org(/works/OL\d{1,8}W)'),
'gdrd': re.compile(r'[\./]goodreads\.com/book/show/(\d{1,8})'),
'ltwk': re.compile(r'[\./]librarything\.com/work/(\d{1,8})'),
'oclc': re.compile(r'\.worldcat\.org/.*oclc/(\d{8,12})'),
'doi': re.compile(r'[\./]doi\.org/(10\.\d+/\S+)'),
'gtbg': re.compile(r'[\./]gutenberg\.org/ebooks/(\d{1,6})'),
'glue': re.compile(r'[\./]unglue\.it/work/(\d{1,7})'),
'goog': re.compile(r'[\./]google\.com/books\?.*id=(?P<id>[a-zA-Z0-9\-_]{12})'),
'olwk': re.compile(r'[\./]openlibrary\.org(?P<id>/works/OL\d{1,8}W)'),
'doab': re.compile(r'([\./]doabooks\.org/doab\?.*rid:|=oai:doab-books:)(?P<id>\d{1,8})'),
'gdrd': re.compile(r'[\./]goodreads\.com/book/show/(?P<id>\d{1,8})'),
'ltwk': re.compile(r'[\./]librarything\.com/work/(?P<id>\d{1,8})'),
'oclc': re.compile(r'\.worldcat\.org/.*oclc/(?P<id>\d{8,12})'),
'doi': re.compile(r'[\./]doi\.org/(?P<id>10\.\d+/\S+)'),
'gtbg': re.compile(r'[\./]gutenberg\.org/ebooks/(?P<id>\d{1,6})'),
'glue': re.compile(r'[\./]unglue\.it/work/(?P<id>\d{1,7})'),
}
def ids_from_urls(url):
@ -346,7 +366,128 @@ def ids_from_urls(url):
for ident in ID_URLPATTERNS.keys():
id_match = ID_URLPATTERNS[ident].search(url)
if id_match:
ids[ident] = id_match.group(1)
ids[ident] = id_match.group('id')
return ids
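Switching the patterns to a named id group lets ids_from_urls read id_match.group('id') uniformly; for the new doab pattern, whose first alternative is itself a capturing group, a plain group(1) would have returned the URL prefix instead of the id. Illustrative calls (URLs are made up):

from regluit.core.loaders.utils import ids_from_urls

ids_from_urls('https://doi.org/10.11647/OBP.0001')
# -> {'doi': '10.11647/OBP.0001'}
ids_from_urls('https://www.doabooks.org/doab?func=search&query=rid:12345')
# -> {'doab': '12345'}
ids_from_urls('https://unglue.it/work/123456/')
# -> {'glue': '123456'}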
DROPBOX_DL = re.compile(r'"(https://dl.dropboxusercontent.com/content_link/[^"]+)"')
def dl_online(ebook):
if ebook.format != 'online':
pass
elif ebook.url.find(u'dropbox.com/s/') >= 0:
response = requests.get(ebook.url, headers={"User-Agent": settings.USER_AGENT})
if response.status_code == 200:
match_dl = DROPBOX_DL.search(response.content)
if match_dl:
return make_dl_ebook(match_dl.group(1), ebook)
else:
logger.warning('couldn\'t get {}'.format(ebook.url))
else:
logger.warning('couldn\'t get dl for {}'.format(ebook.url))
elif ebook.url.find(u'jbe-platform.com/content/books/') >= 0:
doc = get_soup(ebook.url)
if doc:
obj = doc.select_one('div.fulltexticoncontainer-PDF a')
if obj:
dl_url = urlparse.urljoin(ebook.url, obj['href'])
return make_dl_ebook(dl_url, ebook)
else:
logger.warning('couldn\'t get dl_url for {}'.format(ebook.url))
else:
logger.warning('couldn\'t get soup for {}'.format(ebook.url))
return None, False
def make_dl_ebook(url, ebook):
if EbookFile.objects.filter(source=ebook.url):
return EbookFile.objects.filter(source=ebook.url)[0], False
response = requests.get(url, headers={"User-Agent": settings.USER_AGENT})
if response.status_code == 200:
filesize = int(response.headers.get("Content-Length", 0))
filesize = filesize if filesize else None
format = type_for_url(url, content_type=response.headers.get('content-type'))
if format != 'online':
new_ebf = EbookFile.objects.create(
edition=ebook.edition,
format=format,
source=ebook.url,
)
new_ebf.file.save(path_for_file(new_ebf, None), ContentFile(response.content))
new_ebf.save()
new_ebook = Ebook.objects.create(
edition=ebook.edition,
format=format,
provider='Unglue.it',
url=new_ebf.file.url,
rights=ebook.rights,
filesize=filesize,
version_label=ebook.version_label,
version_iter=ebook.version_iter,
)
new_ebf.ebook = new_ebook
new_ebf.save()
return new_ebf, True
else:
logger.warning('download format for {} is not ebook'.format(url))
else:
logger.warning('couldn\'t get {}'.format(url))
return None, False
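dl_online and make_dl_ebook both return an (ebook_file, created) pair: the cached EbookFile with created=False when the source was already harvested, a fresh one with created=True on success, and (None, False) on failure. A calling sketch consistent with the management command later in this commit:

from regluit.core.loaders.utils import dl_online
from regluit.core.models import Ebook

for online in Ebook.objects.filter(format='online'):
    new_ebf, created = dl_online(online)
    if new_ebf and created:
        # a downloadable file was stored and a new Ebook row now points at it
        print 'harvested {}'.format(online.url)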
def type_for_url(url, content_type=None):
if not url:
return ''
if url.find('books.openedition.org') >= 0:
return 'online'
if Ebook.objects.filter(url=url):
return Ebook.objects.filter(url=url)[0].format
ct = content_type if content_type else contenttyper.calc_type(url)
if re.search("pdf", ct):
return "pdf"
elif re.search("octet-stream", ct) and re.search("pdf", url, flags=re.I):
return "pdf"
elif re.search("octet-stream", ct) and re.search("epub", url, flags=re.I):
return "epub"
elif re.search("text/plain", ct):
return "text"
elif re.search("text/html", ct):
if url.find('oapen.org/view') >= 0:
return "html"
return "online"
elif re.search("epub", ct):
return "epub"
elif re.search("mobi", ct):
return "mobi"
return "other"
class ContentTyper(object):
""" """
def __init__(self):
self.last_call = dict()
def content_type(self, url):
try:
r = requests.head(url)
return r.headers.get('content-type', '')
except:
return ''
def calc_type(self, url):
delay = 1
# is there a delay associated with the url
netloc = urlparse.urlparse(url).netloc
# wait if necessary
last_call = self.last_call.get(netloc)
if last_call is not None:
now = time.time()
min_time_next_call = last_call + delay
if min_time_next_call > now:
time.sleep(min_time_next_call-now)
self.last_call[netloc] = time.time()
# compute the content-type
return self.content_type(url)
contenttyper = ContentTyper()
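ContentTyper.calc_type is a polite per-host throttle: it remembers when each netloc was last hit and sleeps until at least delay seconds have elapsed before issuing the next HEAD request. The same idea in isolation (class name hypothetical):

import time
import urlparse   # urllib.parse on Python 3

class HostThrottle(object):
    """Space calls to the same host at least `delay` seconds apart."""
    def __init__(self, delay=1):
        self.delay = delay
        self.last_call = {}

    def wait(self, url):
        netloc = urlparse.urlparse(url).netloc
        last = self.last_call.get(netloc)
        if last is not None:
            earliest = last + self.delay
            if earliest > time.time():
                time.sleep(earliest - time.time())
        self.last_call[netloc] = time.time()

throttle = HostThrottle()
throttle.wait('http://example.com/a')   # returns immediately
throttle.wait('http://example.com/b')   # sleeps ~1 second: same host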

View File

@ -5,18 +5,18 @@ from regluit.core.models import Work
from regluit.core.loaders.doab import update_cover_doab
class Command(BaseCommand):
help = "make covers for doab editions"
help = "make covers for doab editions with bad covers"
def handle(self, **options):
works = Work.objects.filter(selected_edition__isnull=False, selected_edition__cover_image__isnull=True)
#.filter(selected_edition__isnull=False, selected_edition__cover_image__isnull=True)
#.exclude(selected_edition__identifiers__type='goog')
added = 0
for (i, work) in enumerate(works):
if work.doab and work.selected_edition.googlebooks_id == '':
update_cover_doab(work.doab, work.selected_edition)
added += 1
print ('\r {}:{}'.format(i, added), end='')
print('added {} covers'.format(added))
works = Work.objects.filter(identifiers__type='doab').distinct()
print('checking {} works with doab'.format(works.count()))
num = 0
for work in works:
if not work.cover_image_thumbnail():
update_cover_doab(work.doab, work.preferred_edition, store_cover=True)
#print(work.doab)
num += 1
if num % 10 == 0:
print('{} doab covers updated'.format(num))
#break
print('Done: {} doab covers updated'.format(num))

View File

@ -1,6 +1,7 @@
from django.core.management.base import BaseCommand
from regluit.core.models import Subject
from regluit.core.validation import valid_subject
@ -27,3 +28,8 @@ class Command(BaseCommand):
for work in subject.works.all():
Subject.set_by_name(subject.name, work=work)
subject.delete()
period_subjects = Subject.objects.filter(name__contains=".")
for subject in period_subjects:
if not valid_subject(subject.name):
subject.delete()

View File

@ -1,17 +0,0 @@
import os
from django.conf import settings
from django.contrib.auth.models import User
from django.core.management.base import BaseCommand
from regluit.core.loaders import doab
class Command(BaseCommand):
help = "load doab auths"
args = "<limit> <file_name>"
def handle(self, limit=None, file_name="../../../bookdata/doab_auths.json", **options):
command_dir = os.path.dirname(os.path.realpath(__file__))
file_path = os.path.join(command_dir, file_name)
doab.load_doab_auths(file_path, limit=int(limit) if limit else None)

View File

@ -1,17 +0,0 @@
import os
from django.conf import settings
from django.contrib.auth.models import User
from django.core.management.base import BaseCommand
from regluit.core.loaders import doab
class Command(BaseCommand):
help = "load doab books"
args = "<limit> <file_name>"
def handle(self, limit=None, file_name="../../../bookdata/doab.json", **options):
command_dir = os.path.dirname(os.path.realpath(__file__))
file_path = os.path.join(command_dir, file_name)
doab.load_doab_records(file_path, limit=int(limit))

View File

@ -0,0 +1,21 @@
from django.core.management.base import BaseCommand
from regluit.core.loaders.utils import dl_online
from regluit.core.models import Ebook
class Command(BaseCommand):
help = "harvest downloadable ebooks from 'online' ebooks"
args = "<limit>"
def handle(self, limit=0, **options):
limit = int(limit) if limit else 0
onlines = Ebook.objects.filter(format='online')
done = 0
for online in onlines:
new_ebf, new = dl_online(online)
if new_ebf and new:
done += 1
if done > limit:
break
print 'harvested {} ebooks'.format(done)

View File

@ -30,9 +30,9 @@ class Command(BaseCommand):
books = []
for sitemap in content:
added = add_by_sitemap(sitemap.strip(), maxnum=max)
max = max - len(added)
max = max - len(added) if max else max
books = books + added
if max < 0:
if max and max < 0:
break
else:
books = add_by_sitemap(url, maxnum=max)

View File

@ -4,9 +4,9 @@ from regluit.core.loaders.springer import load_springer
class Command(BaseCommand):
help = "load books from springer open"
args = "<pages>"
args = "<startpage> <endpage>"
def handle(self, pages, **options):
books = load_springer(int(pages))
def handle(self, startpage, endpage=0, **options):
books = load_springer(int(startpage), int(endpage))
print "loaded {} books".format(len(books))

View File

@ -0,0 +1,10 @@
from django.core.management.base import BaseCommand
from regluit.core.loaders import doab
class Command(BaseCommand):
help = "load doab books by doab_id via oai"
args = "<doab_id>"
def handle(self, doab_id, **options):
doab.add_by_doab(doab_id)

View File

@ -0,0 +1,18 @@
from django.core.management.base import BaseCommand
from regluit.core.loaders import doab
class Command(BaseCommand):
help = "load doab books via oai"
args = "<from_year> <limit>"
def handle(self, from_year= None, limit=None, **options):
from_year = int(from_year) if from_year else None
limit = int(limit) if limit else None
if limit:
doab.load_doab_oai(from_year=from_year, limit=limit)
else:
if from_year:
doab.load_doab_oai(from_year=from_year)
else:
doab.load_doab_oai()

View File

@ -4,9 +4,9 @@ from random import randint, randrange
from django.conf import settings
from django.core.management.base import BaseCommand
from django.utils.timezone import now
from regluit.core.models import Work, Campaign
from regluit.utils.localdatetime import now
class Command(BaseCommand):
help = "creates random campaigns for any works that lack one for testing"

View File

@ -26,6 +26,7 @@ from django.core.files.base import ContentFile
from django.db import models
from django.db.models import F, Q
from django.db.models.signals import post_save
from django.utils.timezone import now
from django.utils.translation import ugettext_lazy as _
#regluit imports
@ -45,8 +46,9 @@ from regluit.payment.parameters import (
TRANSACTION_STATUS_FAILED,
TRANSACTION_STATUS_INCOMPLETE
)
from regluit.utils import encryption as crypto
from regluit.utils.localdatetime import now, date_today
from regluit.utils.localdatetime import date_today
from regluit.core.parameters import (
REWARDS,

View File

@ -20,10 +20,10 @@ from django.core.urlresolvers import reverse
from django.db import models
from django.db.models import F
from django.db.models.signals import post_save, pre_delete
from django.utils.timezone import now
import regluit
from regluit.marc.models import MARCRecord as NewMARC
from regluit.utils.localdatetime import now
from questionnaire.models import Landing
from regluit.core import mobi
@ -1082,8 +1082,7 @@ class EbookFile(models.Model):
asking=self.asking,
source=self.file.url
)
new_mobi_ebf.file.save(path_for_file('ebf', None), mobi_cf)
new_mobi_ebf.file.save(path_for_file(new_mobi_ebf, None), mobi_cf)
new_mobi_ebf.save()
if self.ebook:
new_ebook = Ebook.objects.create(

View File

@ -42,7 +42,7 @@ OTHER_ID_CHOICES = (
('edid', 'pragmatic edition ID'),
)
WORK_IDENTIFIERS = ('doi','olwk','glue','ltwk', 'http')
WORK_IDENTIFIERS = ('doi','olwk','glue','ltwk', 'http', 'doab')
ID_CHOICES_MAP = dict(ID_CHOICES)

View File

@ -22,6 +22,7 @@ from django.db.utils import DatabaseError
from django.dispatch import Signal
from django.utils.translation import ugettext_noop as _
from django.template.loader import render_to_string
from django.utils.timezone import now
from notification import models as notification
@ -29,9 +30,9 @@ from notification import models as notification
regluit imports
"""
from regluit.payment.signals import transaction_charged, transaction_failed, pledge_modified, pledge_created
from regluit.utils.localdatetime import now, date_today
from regluit.core.parameters import REWARDS, BUY2UNGLUE, THANKS, LIBRARY, RESERVE, THANKED
from regluit.libraryauth.models import Library, LibraryUser
from regluit.utils.localdatetime import date_today
logger = logging.getLogger(__name__)
@ -100,7 +101,7 @@ def create_notice_types( **kwargs):
notification.create_notice_type("purchase_notgot_gift", _("Your gift wasn't received."), _("The ebook you sent as a gift has not yet been redeemed."))
notification.create_notice_type("donation", _("Your donation was processed."), _("Thank you, your generous donation has been processed."))
signals.post_syncdb.connect(create_notice_types, sender=notification)
signals.post_migrate.connect(create_notice_types, sender=notification)
# define the notifications and tie them to corresponding signals

View File

@ -13,6 +13,7 @@ django imports
from django.conf import settings
from django.contrib.auth.models import User
from django.core.mail import send_mail
from django.utils.timezone import now
from notification.engine import send_all
from notification import models as notification
@ -29,8 +30,7 @@ from regluit.core import (
from regluit.core.models import Campaign, Acq, Gift
from regluit.core.signals import deadline_impending
from regluit.core.parameters import RESERVE, REWARDS, THANKS
from regluit.utils.localdatetime import now, date_today
from regluit.utils.localdatetime import date_today
logger = logging.getLogger(__name__)

File diff suppressed because it is too large

View File

@ -19,7 +19,7 @@ ID_VALIDATION = {
'http': (re.compile(r"(https?|ftp)://(-\.)?([^\s/?\.#]+\.?)+(/[^\s]*)?$",
flags=re.IGNORECASE|re.S),
"The Web Address must be a valid http(s) URL."),
'isbn': (r'^([\dxX\-–— ]+|delete)$',
'isbn': (u'^([\\dxX \\-–—‐,;]+|delete)$', #includes unicode hyphen, endash and emdash
"The ISBN must be a valid ISBN-13."),
'doab': (r'^(\d{1,6}|delete)$',
"The value must be 1-6 digits."),
@ -44,8 +44,6 @@ ID_VALIDATION = {
}
def isbn_cleaner(value):
if value == 'delete':
return value
if not value:
raise ValidationError('no identifier value found')
elif value == 'delete':
@ -132,6 +130,8 @@ def valid_xml_char_ordinal(c):
)
def valid_subject(subject_name):
if len(subject_name) > 200:
return False
num_commas = 0
for c in subject_name:
if not valid_xml_char_ordinal(c):
@ -140,6 +140,10 @@ def valid_subject(subject_name):
num_commas += 1
if num_commas > 2:
return False
if len(subject_name.split('--')) > 6:
return False
if len(subject_name.split('. ')) > 4:
return False
return True
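With the new checks, a subject name is rejected when it exceeds 200 characters, contains an invalid XML character, has more than two commas, splits into more than six '--' segments, or into more than four '. ' segments. Illustrative results for the function above:

from regluit.core.validation import valid_subject

assert valid_subject('History of science') is True
assert valid_subject('one, two, three, four') is False          # three commas
assert valid_subject('a--b--c--d--e--f--g') is False            # seven '--' segments
assert valid_subject('One. Two. Three. Four. Five.') is False   # five '. ' segments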
reverse_name_comma = re.compile(r',(?! *Jr[\., ])')

View File

@ -149,14 +149,27 @@ class EditionForm(forms.ModelForm):
id_type = self.cleaned_data['id_type']
id_value = self.cleaned_data.get('id_value','').strip()
if id_value:
identifier = Identifier.objects.filter(type=id_type, value=id_value)
if identifier:
err_msg = "{} is a duplicate for work #{}.".format(identifier[0], identifier[0].work_id)
self.add_error('id_value', forms.ValidationError(err_msg))
try:
self.cleaned_data['id_value'] = identifier_cleaner(id_type)(id_value)
id_value = identifier_cleaner(id_type)(id_value)
identifier = Identifier.objects.filter(type=id_type, value=id_value)
ident = identifier[0] if identifier else None
if not ident or not self.instance:
self.cleaned_data['id_value'] = id_value
elif ident.edition_id == self.instance.id:
self.cleaned_data['id_value'] = id_value
elif not ident.edition_id and ident.work_id == self.instance.work_id:
self.cleaned_data['id_value'] = id_value
else:
if ident.edition_id:
err_msg = "{} is a duplicate for edition #{}.".format(id_value, ident.edition_id)
else:
err_msg = "{} is a duplicate for work #{}.".format(id_value, ident.work_id)
self.add_error('id_value', forms.ValidationError(err_msg))
except forms.ValidationError, ve:
self.add_error('id_value', forms.ValidationError('{}: {}'.format(ve.message, id_value)))
self.add_error(
'id_value',
forms.ValidationError('{}: {}'.format(ve.message, id_value))
)
return self.cleaned_data
class Meta:

View File

@ -13,11 +13,11 @@ from django.conf import settings
from django.forms.extras.widgets import SelectDateWidget
from django.forms.widgets import RadioSelect
from django.utils.translation import ugettext_lazy as _
from django.utils.timezone import now
from regluit.core.lookups import OwnerLookup
from regluit.core.models import Campaign, Edition, Claim, RightsHolder, WasWork
from regluit.core.parameters import *
from regluit.utils.localdatetime import now
class RightsHolderForm(forms.ModelForm):
email = forms.EmailField(

View File

@ -171,7 +171,7 @@
<div class="column show-for-medium">
<span>Contact</span>
<ul>
<li> <a href="mailto:info@ebookfoundation.org"><i class="fa fa-envelope fa-2x"></i></a> <a href="https://twitter.com/unglueit"><i class="fa fa-twitter fa-2x"></i></a> <a href="https://facebook/com/unglueit"><i class="fa fa-facebook fa-2x"></i></a></li>
<li> <a href="mailto:info@ebookfoundation.org"><i class="fa fa-envelope fa-2x"></i></a> <a href="https://twitter.com/unglueit"><i class="fa fa-twitter fa-2x"></i></a> <a href="https://facebook.com/unglueit"><i class="fa fa-facebook fa-2x"></i></a></li>
</ul>
</div>
</div>

View File

@ -1,6 +1,6 @@
{% extends 'work_list.html' %}
{% load endless %}
{% load el_pagination_tags %}
{% load lang_utils %}
{% block title %} Works published by {{ pubname }} {% endblock %}

View File

@ -1,6 +1,6 @@
{% extends 'base.html' %}
{% load endless %}
{% load el_pagination_tags %}
{% load lang_utils %}
{% load sass_tags %}

View File

@ -1,6 +1,6 @@
{% extends 'base.html' %}
{% load endless %}
{% load el_pagination_tags %}
{% load lang_utils %}
{% load sass_tags %}

View File

@ -4,6 +4,7 @@
{% block doccontent %}
<h2>Rights Holder Claim Form </h2>
{% if work %}
<h3> Rightsholder making claim </h3>
{{ rights_holder.rights_holder_name }}
<h3> Work being claimed </h3>
@ -42,4 +43,7 @@
<input type="submit" name="submit" value="Confirm Claim">
</form>
{% endif %}
{% else %}
Please find a work to claim.
{% endif %}
{% endblock %}

View File

@ -1,6 +1,6 @@
{% extends 'base.html' %}
{% load endless %}
{% load el_pagination_tags %}
{% load lang_utils %}
{% load sass_tags %}

View File

@ -1,6 +1,6 @@
{% extends 'base.html' %}
{% load endless %}
{% load el_pagination_tags %}
{% load sass_tags %}
{% load truncatechars %}

View File

@ -1,6 +1,6 @@
{% extends 'work_list.html' %}
{% load endless %}
{% load el_pagination_tags %}
{% load lang_utils %}
{% block title %} Books we're recommending. {% endblock %}

View File

@ -1,6 +1,6 @@
{% extends 'base.html' %}
{% load endless %}
{% load el_pagination_tags %}
{% load truncatechars %}
{% load sass_tags %}

View File

@ -1,6 +1,6 @@
{% extends 'base.html' %}
{% load endless %}
{% load el_pagination_tags %}
{% load lang_utils %}
{% load sass_tags %}

View File

@ -1,6 +1,6 @@
{% extends 'base.html' %}
{% load endless %}
{% load el_pagination_tags %}
{% load lang_utils %}
{% load sass_tags %}

View File

@ -1,5 +1,6 @@
from django import template
from regluit.utils.localdatetime import now
from django.utils.timezone import now
from regluit.core.parameters import REWARDS, BUY2UNGLUE
register = template.Library()

View File

@ -1,12 +1,6 @@
"""
The truncatechars filter is part of Django dev, but we're on 1.3.1
The following is the filter and its dependencies
To use this filter, put "{% load truncatechars %}" at the beginning of your template,
then {{ myvariable|truncatechars:num }}
"""
import unicodedata
from django.template.base import Library
from django.template import Library
from django.template.defaultfilters import stringfilter
from django.utils.translation import get_language_info

View File

@ -1,4 +1,4 @@
from regluit.utils.localdatetime import now
from django.utils.timezone import now
from django import template
register = template.Library()

View File

@ -1,5 +1,6 @@
from regluit.utils.localdatetime import now
from django import template
from django.utils.timezone import now
from regluit.core.models import Acq
register = template.Library()

View File

@ -7,7 +7,7 @@ then {{ myvariable|truncatechars:num }}
import unicodedata
from django import template
from django.template.base import Library
from django.template import Library
from django.template.defaultfilters import stringfilter
from django.utils.encoding import force_unicode
from django.utils.functional import allow_lazy, SimpleLazyObject

View File

@ -3,7 +3,7 @@
"""
from urllib import unquote
from django.template.base import Library
from django.template import Library
from django.template.defaultfilters import stringfilter
register = Library()

View File

@ -13,6 +13,7 @@ from django.core import mail
from django.core.urlresolvers import reverse
from django.test import TestCase
from django.test.client import Client
from django.utils.timezone import now
from notification.models import Notice
@ -21,7 +22,6 @@ from regluit.core.models import Work, Campaign, RightsHolder, Claim, Subject
from regluit.payment.models import Transaction
from regluit.payment.manager import PaymentManager
from regluit.payment.stripelib import StripeClient, TEST_CARDS, ERROR_TESTING, card
from regluit.utils.localdatetime import now
class WishlistTests(TestCase):
fixtures = ['initial_data.json', 'neuromancer.json']

View File

@ -35,7 +35,7 @@ urlpatterns = [
url(r"^rightsholders/campaign/(?P<id>\d+)/mademobi/$", views.manage_campaign, {'action': 'mademobi'}, name="mademobi"),
url(r"^rightsholders/edition/(?P<work_id>\d*)/(?P<edition_id>\d*)$", views.edit_edition, {'by': 'rh'}, name="rh_edition"),
url(r"^rightsholders/edition/(?P<edition_id>\d*)/upload/$", views.edition_uploads, name="edition_uploads"),
url(r"^rightsholders/claim/$", views.claim, name="claim"),
url(r"^rightsholders/claim/$", login_required(views.claim), name="claim"),
url(r"^rightsholders/surveys/$", views.surveys, name="surveys"),
url(r"^rightsholders/new_survey/(?P<work_id>\d*)/?$", views.new_survey, name="new_survey"),
url(r"^rightsholders/surveys/answers_(?P<qid>\d+)_(?P<work_id>\d*).csv$", views.export_surveys, name="survey_answers"),

View File

@ -45,6 +45,7 @@ from django.template import TemplateDoesNotExist
from django.template.loader import render_to_string
from django.utils.http import urlencode
from django.utils.translation import ugettext_lazy as _
from django.utils.timezone import now
from django.views.decorators.csrf import csrf_exempt
from django.views.decorators.http import require_POST
from django.views.generic.edit import FormView
@ -123,11 +124,11 @@ from regluit.payment.parameters import (
COMPANY_TITLE
)
from regluit.utils.localdatetime import now, date_today
from regluit.libraryauth.forms import UserNamePass
from regluit.libraryauth.views import Authenticator, superlogin, login_user
from regluit.libraryauth.models import Library
from regluit.marc.views import qs_marc_records
from regluit.utils.localdatetime import date_today
from questionnaire.models import Landing, Questionnaire
from questionnaire.views import export_summary as answer_summary, export_csv as export_answers

View File

@ -21,6 +21,7 @@ from regluit.core.bookloader import (
from regluit.core.parameters import WORK_IDENTIFIERS
from regluit.core.loaders import add_by_webpage
from regluit.core.loaders.doab import add_by_doab
from regluit.core.loaders.utils import ids_from_urls
from regluit.frontend.forms import EditionForm, IdentifierForm
@ -106,6 +107,11 @@ def get_edition_for_id(id_type, id_value, user=None):
if edition:
return user_edition(edition, user)
if identifiers.has_key('doab'):
edition = add_by_doab(identifiers['doab'])
if edition:
return user_edition(edition, user)
if identifiers.has_key('oclc'):
edition = add_by_oclc(identifiers['oclc'])
if edition:
@ -296,11 +302,17 @@ def edit_edition(request, work_id, edition_id, by=None):
id_type = form.cleaned_data['id_type']
id_val = form.cleaned_data['id_value']
if id_val == 'delete':
if edition.identifiers.exclude(type=id_type):
edition.identifiers.filter(type=id_type).delete()
if id_val == 'delete':
if id_type in WORK_IDENTIFIERS:
if edition.work.identifiers.exclude(type=id_type):
edition.work.identifiers.filter(type=id_type).delete()
else:
alert = ('Can\'t delete identifier - must have at least one left.')
else:
alert = ('Can\'t delete identifier - must have at least one left.')
if edition.identifiers.exclude(type=id_type):
edition.identifiers.filter(type=id_type).delete()
else:
alert = ('Can\'t delete identifier - must have at least one left.')
elif id_val:
models.Identifier.set(
type=id_type,

View File

@ -88,6 +88,8 @@ class ClaimView(CreateView):
return HttpResponseRedirect(reverse('rightsholders'))
def get_context_data(self, form):
if not form.is_valid():
return {'form': form}
work = form.cleaned_data['work']
rights_holder = form.cleaned_data['rights_holder']
active_claims = work.claim.exclude(status = 'release')

View File

@ -1 +1,9 @@
from . import signals
from django.apps import AppConfig
default_app_config = 'regluit.libraryauth.LibraryAuthConfig'
class LibraryAuthConfig(AppConfig):
name = 'regluit.libraryauth'
def ready(self):
from . import signals

View File

@ -8,7 +8,7 @@ from django.core import validators
from django.db import models
from django.db.models import Q
from django.db.models.signals import post_save
from django.forms import IPAddressField as BaseIPAddressField
from django.forms import GenericIPAddressField as BaseIPAddressField
from django.utils.translation import ugettext_lazy as _
from django.core.urlresolvers import reverse

View File

@ -1,6 +1,6 @@
import unicodedata
from django.template.base import Library
from django.template import Library
from .. import models
register = Library()

View File

@ -10,12 +10,12 @@ from datetime import timedelta
django imports
"""
from django.http import HttpResponseForbidden
from django.utils.timezone import now
"""
regluit imports
"""
from regluit.payment.models import PaymentResponse
from regluit.utils.localdatetime import now, zuluformat
class ProcessorError(Exception):
"""An abstraction around payment processor exceptions"""

View File

@ -18,6 +18,7 @@ django imports
from django.conf import settings
from django.contrib.auth.models import User
from django.core.urlresolvers import reverse
from django.utils.timezone import now
"""
regluit imports
@ -26,7 +27,6 @@ from regluit.payment import credit
from regluit.payment.models import Transaction, Receiver, PaymentResponse, Account
from regluit.payment.parameters import *
from regluit.payment.signals import transaction_charged, pledge_modified, pledge_created
from regluit.utils.localdatetime import now
logger = logging.getLogger(__name__)

View File

@ -18,6 +18,7 @@ from django.db.models import Q
from django.contrib.sites.models import Site
from django.db.models.signals import post_save, post_delete
from django.utils.http import urlquote
from django.utils.timezone import now
## django module imports
@ -42,7 +43,7 @@ from regluit.payment.parameters import (
)
from regluit.payment.signals import credit_balance_added, pledge_created
from regluit.utils.localdatetime import now, date_today
from regluit.utils.localdatetime import date_today
logger = logging.getLogger(__name__)

View File

@ -6,12 +6,15 @@ external library imports
"""
import logging
import json
import re
import stripe
from datetime import datetime, timedelta
from itertools import islice
from pytz import utc
import re
import unittest
from unittest import TestCase
import stripe
"""
django imports
@ -19,6 +22,7 @@ django imports
from django.conf import settings
from django.core.mail import send_mail
from django.http import HttpResponse
from django.utils.timezone import now
"""
regluit imports
@ -35,7 +39,6 @@ from regluit.payment.parameters import (
TRANSACTION_STATUS_CANCELED
)
from regluit.payment.signals import transaction_charged, transaction_failed
from regluit.utils.localdatetime import now, zuluformat
# as of 2013.07.15
# ['charge.disputed', 'coupon.updated'] are legacy events -- don't know whether to
@ -73,12 +76,6 @@ def grouper(iterable, page_size):
class StripelibError(baseprocessor.ProcessorError):
pass
try:
import unittest
from unittest import TestCase
except:
from django.test import TestCase
from django.utils import unittest
# if customer.id doesn't exist, create one and then charge the customer
# we probably should ask our users whether they are ok with our creating a customer id account -- or ask for credit

View File

@ -5,6 +5,7 @@ import logging
import os
import time
import traceback
import unittest
from datetime import timedelta
from decimal import Decimal as D
@ -19,7 +20,7 @@ from django.contrib.auth.models import User
from django.core.exceptions import ValidationError
from django.core.validators import URLValidator
from django.test import TestCase
from django.utils import unittest
from django.utils.timezone import now
"""
regluit imports
@ -29,7 +30,6 @@ from regluit.core.signals import handle_transaction_charged
from regluit.payment.manager import PaymentManager
from regluit.payment.models import Transaction, Account
from regluit.payment.parameters import *
from regluit.utils.localdatetime import now
def setup_selenium():
# Set the display window for our xvfb

View File

@ -13,7 +13,7 @@ django imports
"""
from django.conf import settings
from django.contrib.auth.models import User
from django.contrib.sites.models import RequestSite
from django.contrib.sites.requests import RequestSite
from django.core.urlresolvers import reverse
from django.http import (
HttpResponse,
@ -24,6 +24,7 @@ from django.http import (
from django.shortcuts import render_to_response
from django.template import RequestContext
from django.test.utils import setup_test_environment
from django.utils.timezone import now
from django.views.decorators.csrf import csrf_exempt
from django.views.generic.edit import FormView
from django.views.generic.base import TemplateView
@ -38,7 +39,6 @@ from regluit.payment.models import Transaction
from regluit.payment.parameters import *
from regluit.payment.stripelib import STRIPE_PK
from regluit.payment.tests import PledgeTest, AuthorizeTest
from regluit.utils.localdatetime import now
logger = logging.getLogger(__name__)

View File

@ -3,19 +3,14 @@ Fabric==1.6.0
MySQL-python==1.2.5
Pillow==3.4.2
PyJWT==1.4.1
PyPDF2==1.23
PyPDF2==1.26
PyGithub==1.15.0
PyYAML==3.11
git+git://github.com/urschrei/pyzotero.git@v0.9.51
SPARQLWrapper==1.6.4
WebOb==1.2.3
WebTest==1.4.0
amqp==1.4.9
anyjson==0.3.3
billiard==3.3.0.23
awscli==1.10.26
boto==2.42.0
#git+ssh://git@github.com/Gluejar/boto.git@2.3.0
celery==3.1.23
certifi==2016.2.28
# pip installing pillow seems to delete distribute
@ -24,36 +19,34 @@ certifi==2016.2.28
django-celery==3.1.17
django-ckeditor==4.5.1
#django-email-change==0.2.3
git+git://github.com/eshellman/django-email-change.git@1e71dd320504d56b1fc7d447ce4cffb550cedce7
git+git://github.com/eshellman/django-email-change.git@57169bdef1c8a41d122e2bab2dcd8564b8fb231d
django-compat==1.0.10
django-contrib-comments==1.7.1
django-endless-pagination==2.0
django-el-pagination==3.2.4
django-extensions==1.6.1
django-jsonfield==1.0.0
#django-kombu==0.9.4
django-maintenancemode==0.11.2
django-mptt==0.8.5
#django-nose-selenium==0.7.3
#django-notification==0.2
git+git://github.com/eshellman/django-notification.git@412c7a03a327195a1017c2be92c8e2caabc880b6
git+git://github.com/eshellman/django-notification.git@a4620e893e2da220994e0189bf5d980bfbdcf0ad
django-registration==2.1.2
django-selectable==0.9.0
django-smtp-ssl==1.0
django-storages==1.4.1
django-tastypie==0.13.3
django-transmeta==0.7.3
feedparser==5.1.2
#django-transmeta==0.7.3
git+git://github.com/resulto/django-transmeta.git@ad4d7278ba330dcf8c8446f8ae9b2c769ae8684e
fef-questionnaire==4.0.1
freebase==1.0.8
#gitenberg.metadata==0.1.6
git+https://github.com/gitenberg-dev/gitberg-build
#git+ssh://git@github.com/gitenberg-dev/metadata.git@0.1.11
github3.py==0.9.5
html5lib==1.0b3
html5lib==1.0.1
httplib2==0.7.5
isodate==0.5.1
kombu==3.0.35
lxml==2.3.5
lxml==4.2.1
defusedxml==0.4.1
mechanize==0.2.5
mimeparse==0.1.3
@ -66,6 +59,7 @@ paramiko==1.14.1
postmonkey==1.0b
pycrypto==2.6
pymarc==3.0.2
pyoai==2.5.0
pyparsing==2.0.3
python-dateutil==2.5.3
python-mimeparse==0.1.4
@ -75,12 +69,12 @@ pytz==2016.6.1
rdflib==4.2.0
rdflib-jsonld==0.3
redis==2.10.3
reportlab==3.1.8
reportlab==3.4.0
requests==2.10.0
requests-mock==1.2.0
requests-oauthlib==0.6.2
selenium==2.53.1
six==1.9.0
six==1.11.0
sorl-thumbnail==12.3
ssh==1.7.14
stevedore==1.12.0
@ -89,7 +83,8 @@ virtualenv==1.4.9
# virtualenv-clone==0.2.4 not sure why I have this in my env
#virtualenvwrapper==3.6
wsgiref==0.1.2
xhtml2pdf==0.0.6
xhtml2pdf==0.2.2
webencodings==0.5.1
#for urllib3 secure
cffi==1.7.0
cryptography==2.1.4

View File

@ -165,7 +165,7 @@ INSTALLED_APPS = (
'social.apps.django_app.default',
'tastypie',
'djcelery',
'endless_pagination',
'el_pagination',
'selectable',
'regluit.frontend.templatetags',
'notification',

View File

@ -29,7 +29,9 @@ DATABASES = {
'PASSWORD': '',
'HOST': '',
'PORT': '',
'TEST_CHARSET': 'utf8',
'TEST': {
'CHARSET': 'utf8',
}
}
}

View File

@ -20,7 +20,9 @@ DATABASES = {
'PASSWORD': 'regluit',
'HOST': '',
'PORT': '',
'TEST_CHARSET': 'utf8',
'TEST': {
'CHARSET': 'utf8',
}
}
}

View File

@ -22,7 +22,9 @@ DATABASES = {
'PASSWORD': DATABASE_PASSWORD,
'HOST': DATABASE_HOST,
'PORT': '',
'TEST_CHARSET': 'utf8'
'TEST': {
'CHARSET': 'utf8',
}
}
}

View File

@ -21,7 +21,9 @@ DATABASES = {
'PASSWORD': DATABASE_PASSWORD,
'HOST': DATABASE_HOST,
'PORT': '',
'TEST_CHARSET': 'utf8',
'TEST': {
'CHARSET': 'utf8',
}
}
}

View File

@ -23,7 +23,9 @@ DATABASES = {
'PASSWORD': DATABASE_PASSWORD,
'HOST': DATABASE_HOST,
'PORT': '',
'TEST_CHARSET': 'utf8',
'TEST': {
'CHARSET': 'utf8',
}
}
}

View File

@ -1 +1 @@
import localdatetime

View File

@ -1,6 +1,10 @@
from django.conf.global_settings import LANGUAGES
lang2code = dict([ (lang[1].lower(), lang[0]) for lang in LANGUAGES ])
code2lang = dict(LANGUAGES)
def get_language_code(language):
return lang2code.get(language.lower().strip(), '')
language = language.lower().strip()
if language in code2lang:
return language
return lang2code.get(language, '')

View File

@ -1,140 +1,8 @@
"""
Utility to return datetime.datetime.utcnow() by default but allows for a custom utcnow() (e.g., for testing)
from django.utils.timezone import now
>>> import regluit
>>> from regluit.utils.localdatetime import now
>>> now()
datetime.datetime(2012, 3, 8, 14, 0, 35, 409270)
>>> now()
datetime.datetime(2012, 3, 8, 14, 0, 36, 985271)
>>> n = now()
>>> n
datetime.datetime(2012, 3, 8, 14, 1, 54, 650679)
>>> regluit.utils.localdatetime._now = lambda: n
>>> now()
datetime.datetime(2012, 3, 8, 14, 1, 54, 650679)
>>> now()
datetime.datetime(2012, 3, 8, 14, 1, 54, 650679)
>>> now()
DST handled:
>>> ptz = pytz.timezone('America/Los_Angeles')
>>> make_naive(datetime.datetime(2012,03,11,10,tzinfo=utc), ptz)
datetime.datetime(2012, 3, 11, 3, 0)
>>> make_naive(datetime.datetime(2012,03,11,9,tzinfo=utc), ptz)
datetime.datetime(2012, 3, 11, 1, 0)
>>> make_aware(datetime.datetime(2012,11,4,1,30), ptz)
Traceback (most recent call last):
File "<console>", line 1, in <module>
File "/Users/raymondyee/C/src/Gluejar/regluit/utils/localdatetime.py", line 90, in make_aware
return timezone.localize(value, is_dst=None)
File "/Users/raymondyee/.virtualenvs/regluit/lib/python2.7/site-packages/pytz/tzinfo.py", line 349, in localize
raise AmbiguousTimeError(dt)
AmbiguousTimeError: 2012-11-04 01:30:00
"""
import pytz
import datetime
import django
from django.conf import settings
# for Django 1.3.x, return a timestamp naive now()
# for Django 1.4 should switch to django.utils.timezone.now()
# see https://code.djangoproject.com/browser/django/trunk/django/utils/timezone.py?rev=17642#L232
def now():
if hasattr(settings, 'LOCALDATETIME_NOW') and settings.LOCALDATETIME_NOW is not None:
return settings.LOCALDATETIME_NOW()
else:
try:
return django.utils.timezone.now()
except AttributeError, e:
return datetime.datetime.now()
# provide a replacement for datetime.date.today()
# this will be timezone naive -- is that what we really want?
# switch to django.utils.timezone.localdate in django 1.11
def date_today():
return now().date()
# borrow a lot of the routines/code that will be in Django 1.4+ django.utils.timezone
# https://code.djangoproject.com/browser/django/trunk/django/utils/timezone.py
utc = pytz.utc
def get_default_timezone():
return pytz.timezone(settings.TIME_ZONE)
def is_aware(value):
"""
Determines if a given datetime.datetime is aware.
The logic is described in Python's docs:
http://docs.python.org/library/datetime.html#datetime.tzinfo
"""
return value.tzinfo is not None and value.tzinfo.utcoffset(value) is not None
def is_naive(value):
"""
Determines if a given datetime.datetime is naive.
The logic is described in Python's docs:
http://docs.python.org/library/datetime.html#datetime.tzinfo
"""
return value.tzinfo is None or value.tzinfo.utcoffset(value) is None
def make_aware(value, timezone):
"""
Makes a naive datetime.datetime in a given time zone aware.
"""
if hasattr(timezone, 'localize'):
# available for pytz time zones
return timezone.localize(value, is_dst=None)
else:
# may be wrong around DST changes
return value.replace(tzinfo=timezone)
def make_naive(value, timezone):
"""
Makes an aware datetime.datetime naive in a given time zone.
"""
value = value.astimezone(timezone)
if hasattr(timezone, 'normalize'):
# available for pytz time zones
value = timezone.normalize(value)
return value.replace(tzinfo=None)
def isoformat(value):
"""
if value is naive, assume it's in the default_timezone
"""
if is_naive(value):
return make_aware(value, get_default_timezone()).isoformat()
else:
return value.isoformat()
def zuluformat(value):
"""format value in zulu format -- e.g., 2012-03-26T17:47:22.654449Z"""
return "{0}Z".format(as_utc_naive(value).isoformat())
def as_utc_naive(value):
"""
if value is naive, assume it's in the default time zone, then convert to UTC but make naive
"""
if is_naive(value):
return make_naive(make_aware(value, get_default_timezone()), utc)
else:
return make_naive(value, utc)
def as_default_timezone_naive(value):
"""
if value is naive, assume it's in UTC and convert to the default tz and make it naive
"""
if is_naive(value):
return make_naive(make_aware(value, utc), get_default_timezone())
else:
return make_naive(value, get_default_timezone())
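The hunk header (-1,140 +1,8) shows this Django 1.3-era shim shrinking to a few lines: now comes from Django itself (the import rewritten throughout this commit), while date_today() is kept as a small convenience. A sketch of what the slimmed-down module most likely contains, inferred from the new lines visible in the diff:

from django.utils.timezone import now   # replaces regluit.utils.localdatetime.now

# provide a replacement for datetime.date.today()
# switch to django.utils.timezone.localdate in django 1.11
def date_today():
    return now().date()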