Merge remote-tracking branch 'Gluejar/master' into catchup

# Conflicts:
#	core/models/__init__.py
#	core/models/bibmodels.py
#	vagrant/host_vars/prod/secrets.yml
pull/85/head
eric 2018-05-23 13:56:27 -04:00
commit 3661faec0a
78 changed files with 1551 additions and 55413 deletions

View File

@@ -25,7 +25,7 @@ def onix_feed(facet, max=None):
editions = facet.facet_object.filter_model("Edition",editions).distinct()
for edition in editions:
edition_prod = product(edition, facet.facet_object)
- if edition_prod:
+ if edition_prod is not None:
feed.append(edition_prod)
return etree.tostring(feed, pretty_print=True)
@@ -34,7 +34,7 @@ def onix_feed_for_work(work):
feed.append(header(work))
for edition in models.Edition.objects.filter(work=work,ebooks__isnull=False).distinct():
edition_prod = product(edition)
- if edition_prod:
+ if edition_prod is not None:
feed.append(product(edition))
return etree.tostring(feed, pretty_print=True)

View File

@@ -10,6 +10,7 @@ django imports
from django.contrib.auth.models import User
from django.test import TestCase
from django.test.client import Client
+ from django.utils.timezone import now
"""
regluit imports
@@ -17,7 +18,6 @@ regluit imports
import regluit.core.isbn
from regluit.core import models
- from regluit.utils.localdatetime import now
from regluit.api import models as apimodels
class ApiTests(TestCase):

View File

@@ -40,7 +40,7 @@ urlpatterns = [
url(r"^onix/(?P<facet>.*)/$", OnixView.as_view(), name="onix"),
url(r"^onix/$", OnixView.as_view(), name="onix_all"),
url(r'^id/work/(?P<work_id>\w+)/$', negotiate_content, name="work_identifier"),
- url(r'^loader/yaml$',load_yaml, name="load_yaml"),
- url(r'^travisci/webhook$',travisci_webhook, name="travisci_webhook"),
+ url(r'^loader/yaml$', load_yaml, name="load_yaml"),
+ url(r'^travisci/webhook$', travisci_webhook, name="travisci_webhook"),
url(r'^', include(v1_api.urls)),
]

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

View File

@@ -5,25 +5,24 @@ from urllib import quote
from functools import partial
from xml.etree import ElementTree
+ from django.apps import apps
from . exceptions import BooXtreamError
- from . models import Boox
class BooXtream(object):
""" ``apikey``
The API key for your BooXtream account, obtained from BooXtream. Defaults to using
settings.BOOXTREAM_API_KEY
``apiuser``
The username key for your BooXtream account, obtained from BooXtream. Defaults to using
settings.BOOXTREAM_API_USER
``timeout``
passed to requests
"""
def __init__(self,
@@ -36,58 +35,60 @@ class BooXtream(object):
apiuser = settings.BOOXTREAM_API_USER
self.endpoint = 'https://service.booxtream.com/'
self.postrequest = partial(requests.post, timeout=timeout, auth=(apiuser,apikey))
def platform(self, epubfile=None, epub=True, kf8mobi=False, **kwargs):
""" Make an API request to BooXtream
""" Make an API request to BooXtream
``self.apikey``, ``epubfile`` and the supplied ``kwargs``.
Attempts to deserialize the XML response and return the download link.
Will raise ``BooXtreamError`` if BooXtream returns an exception
code.
"""
- url = self.endpoint + 'booxtream.xml'
+ Boox = apps.get_model('booxtream', 'Boox')
+ url = self.endpoint + 'booxtream.xml'
kwargs['epub'] = '1' if epub else '0'
kwargs['kf8mobi'] = '1' if kf8mobi else '0'
if epubfile:
if hasattr(epubfile,'name') and str(epubfile.name).endswith('.epub'):
files= {'epubfile': (str(epubfile.name),epubfile)}
else:
# give it a random file name so that kindlegen doesn't choke
# needed for in-memory (StringIO) epubs
files= {'epubfile': ('%012x.epub' % random.randrange(16**12),epubfile)}
else:
files={}
if settings.LOCAL_TEST:
# fake it, so you can test other functions without hitting booxtream
boox = Boox.objects.create(
download_link_epub='https://github.com/eshellman/42_ebook/blob/master/download/42.epub?raw=true&extra=download.booxtream.com/',
download_link_mobi='https://github.com/eshellman/42_ebook/blob/master/download/42.mobi?raw=true',
referenceid= kwargs.get('referenceid'),
downloads_remaining= kwargs.get('downloadlimit'),
expirydays=kwargs.get('expirydays'),
)
return boox
resp = self.postrequest(url, data=kwargs, files=files)
doc = ElementTree.fromstring(resp.content)
# it turns out an Error can have an Error in it
errors = doc.findall('.//Response/Error')
if len(errors) > 0:
raise BooXtreamError(errors)
download_link_epub = doc.find('.//DownloadLink[@type="epub"]')
if download_link_epub is not None:
download_link_epub = download_link_epub.text
download_link_mobi = doc.find('.//DownloadLink[@type="mobi"]')
if download_link_mobi is not None:
download_link_mobi = download_link_mobi.text
boox = Boox.objects.create(
download_link_epub=download_link_epub,
download_link_mobi=download_link_mobi,
referenceid= kwargs.get('referenceid'),
downloads_remaining= kwargs.get('downloadlimit'),
expirydays=kwargs.get('expirydays'),
)
return boox
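
For context, a minimal sketch of how this client is typically driven once the change above is applied. The import path, credential settings, and keyword values are illustrative assumptions, not part of the commit:

# Hedged usage sketch (Python 2, matching the codebase).
# Assumes settings.BOOXTREAM_API_USER / BOOXTREAM_API_KEY are configured.
from regluit.booxtream import BooXtream

client = BooXtream(timeout=30)
with open('book.epub', 'rb') as epubfile:  # any file-like object named *.epub works
    boox = client.platform(
        epubfile=epubfile,
        referenceid='order-0001',   # illustrative BooXtream form fields
        downloadlimit=3,
        expirydays=30,
    )
print boox.download_link_epub  # Boox row built from the parsed DownloadLink elements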

View File

@@ -1,11 +1,10 @@
from django.apps import AppConfig
from django.db.models.signals import post_migrate
- from regluit.core.signals import create_notice_types
class CoreConfig(AppConfig):
name = 'regluit.core'
verbose_name = ' core objects'
def ready(self):
+ from regluit.core.signals import create_notice_types
post_migrate.connect(create_notice_types, sender=self)

View File

@@ -23,6 +23,7 @@ from django_comments.models import Comment
from github3 import (login, GitHub)
from github3.repos.release import Release
+ from django.utils.timezone import now
from gitenberg.metadata.pandata import Pandata
# regluit imports
@@ -31,7 +32,6 @@ import regluit
import regluit.core.isbn
from regluit.core.validation import test_file
from regluit.marc.models import inverse_marc_rels
- from regluit.utils.localdatetime import now
from . import cc
from . import models
@@ -49,7 +49,7 @@ def add_by_oclc(isbn, work=None):
def add_by_oclc_from_google(oclc):
if oclc:
- logger.info("adding book by oclc %s", oclc)
+ logger.info(u"adding book by oclc %s", oclc)
else:
return None
try:
@@ -59,10 +59,10 @@ def add_by_oclc_from_google(oclc):
try:
results = _get_json(url, {"q": '"OCLC%s"' % oclc})
except LookupFailure, e:
- logger.exception("lookup failure for %s", oclc)
+ logger.exception(u"lookup failure for %s", oclc)
return None
if not results.has_key('items') or not results['items']:
- logger.warn("no google hits for %s", oclc)
+ logger.warn(u"no google hits for %s", oclc)
return None
try:
@@ -70,16 +70,16 @@ def add_by_oclc_from_google(oclc):
models.Identifier(type='oclc', value=oclc, edition=e, work=e.work).save()
return e
except LookupFailure, e:
- logger.exception("failed to add edition for %s", oclc)
+ logger.exception(u"failed to add edition for %s", oclc)
except IntegrityError, e:
- logger.exception("google books data for %s didn't fit our db", oclc)
+ logger.exception(u"google books data for %s didn't fit our db", oclc)
return None
def valid_isbn(isbn):
try:
return identifier_cleaner('isbn')(isbn)
except:
logger.exception("invalid isbn: %s", isbn)
logger.exception(u"invalid isbn: %s", isbn)
return None
def add_by_isbn(isbn, work=None, language='xx', title=''):
@@ -88,13 +88,17 @@ def add_by_isbn(isbn, work=None, language='xx', title=''):
try:
e = add_by_isbn_from_google(isbn, work=work)
except LookupFailure:
- logger.exception("failed google lookup for %s", isbn)
+ logger.exception(u"failed google lookup for %s", isbn)
# try again some other time
return None
if e:
+ if e.work.language == 'xx' and language != 'xx':
+ e.work.language = language
+ e.work.save()
+ logger.info('changed language for {} to {}'.format(isbn, language))
return e
logger.info("null came back from add_by_isbn_from_google: %s", isbn)
logger.info(u"null came back from add_by_isbn_from_google: %s", isbn)
# if there's a title, we want to create stub editions and
# works, even if google doesn't know about it # but if it's not valid,
@@ -129,10 +133,10 @@ def get_google_isbn_results(isbn):
try:
results = _get_json(url, {"q": "isbn:%s" % isbn})
except LookupFailure:
- logger.exception("lookup failure for %s", isbn)
+ logger.exception(u"lookup failure for %s", isbn)
return None
if not results.has_key('items') or not results['items']:
- logger.warn("no google hits for %s", isbn)
+ logger.warn(u"no google hits for %s", isbn)
return None
return results
@@ -201,7 +205,7 @@ def update_edition(edition):
# if the language of the edition no longer matches that of the parent work,
# attach edition to the
if edition.work.language != language:
- logger.info("reconnecting %s since it is %s instead of %s",
+ logger.info(u"reconnecting %s since it is %s instead of %s",
googlebooks_id, language, edition.work.language)
old_work = edition.work
@@ -210,7 +214,7 @@ def update_edition(edition):
edition.work = new_work
edition.save()
for identifier in edition.identifiers.all():
- logger.info("moving identifier %s", identifier.value)
+ logger.info(u"moving identifier %s", identifier.value)
identifier.work = new_work
identifier.save()
if old_work and old_work.editions.count() == 0:
@@ -256,7 +260,7 @@ def add_by_isbn_from_google(isbn, work=None):
edition.new = False
return edition
- logger.info("adding new book by isbn %s", isbn)
+ logger.info(u"adding new book by isbn %s", isbn)
results = get_google_isbn_results(isbn)
if results:
try:
@@ -267,9 +271,9 @@ def add_by_isbn_from_google(isbn, work=None):
isbn=isbn
)
except LookupFailure, e:
- logger.exception("failed to add edition for %s", isbn)
+ logger.exception(u"failed to add edition for %s", isbn)
except IntegrityError, e:
- logger.exception("google books data for %s didn't fit our db", isbn)
+ logger.exception(u"google books data for %s didn't fit our db", isbn)
return None
return None
@@ -320,7 +324,7 @@ def add_by_googlebooks_id(googlebooks_id, work=None, results=None, isbn=None):
if results:
item = results
else:
- logger.info("loading metadata from google for %s", googlebooks_id)
+ logger.info(u"loading metadata from google for %s", googlebooks_id)
url = "https://www.googleapis.com/books/v1/volumes/%s" % googlebooks_id
item = _get_json(url)
d = item['volumeInfo']
@@ -343,7 +347,7 @@ def add_by_googlebooks_id(googlebooks_id, work=None, results=None, isbn=None):
if len(language) > 5:
language = language[0:5]
if work and work.language != language:
- logger.info("not connecting %s since it is %s instead of %s",
+ logger.info(u"not connecting %s since it is %s instead of %s",
googlebooks_id, language, work.language)
work = None
# isbn = None
@@ -371,7 +375,7 @@ def add_by_googlebooks_id(googlebooks_id, work=None, results=None, isbn=None):
try:
e = models.Identifier.objects.get(type='goog', value=googlebooks_id).edition
e.new = False
- logger.warning(" whoa nellie, somebody else created an edition while we were working.")
+ logger.warning(u" whoa nellie, somebody else created an edition while we were working.")
if work.new:
work.delete()
return e
@@ -404,19 +408,19 @@ def relate_isbn(isbn, cluster_size=1):
"""add a book by isbn and then see if there's an existing work to add it to so as to make a
cluster bigger than cluster_size.
"""
- logger.info("finding a related work for %s", isbn)
+ logger.info(u"finding a related work for %s", isbn)
edition = add_by_isbn(isbn)
if edition is None:
return None
if edition.work is None:
- logger.info("didn't add related to null work")
+ logger.info(u"didn't add related to null work")
return None
if edition.work.editions.count() > cluster_size:
return edition.work
for other_isbn in thingisbn(isbn):
# 979's come back as 13
logger.debug("other_isbn: %s", other_isbn)
logger.debug(u"other_isbn: %s", other_isbn)
if len(other_isbn) == 10:
other_isbn = regluit.core.isbn.convert_10_to_13(other_isbn)
related_edition = add_by_isbn(other_isbn, work=edition.work)
@@ -427,7 +431,7 @@ def relate_isbn(isbn, cluster_size=1):
related_edition.work = edition.work
related_edition.save()
elif related_edition.work_id != edition.work_id:
- logger.debug("merge_works path 1 %s %s", edition.work_id, related_edition.work_id)
+ logger.debug(u"merge_works path 1 %s %s", edition.work_id, related_edition.work_id)
merge_works(related_edition.work, edition.work)
if related_edition.work.editions.count() > cluster_size:
return related_edition.work
@@ -438,7 +442,7 @@ def add_related(isbn):
The initial seed ISBN will be added if it's not already there.
"""
# make sure the seed edition is there
- logger.info("adding related editions for %s", isbn)
+ logger.info(u"adding related editions for %s", isbn)
new_editions = []
@@ -446,14 +450,14 @@ def add_related(isbn):
if edition is None:
return new_editions
if edition.work is None:
- logger.warning("didn't add related to null work")
+ logger.warning(u"didn't add related to null work")
return new_editions
# this is the work everything will hang off
work = edition.work
other_editions = {}
for other_isbn in thingisbn(isbn):
# 979's come back as 13
logger.debug("other_isbn: %s", other_isbn)
logger.debug(u"other_isbn: %s", other_isbn)
if len(other_isbn) == 10:
other_isbn = regluit.core.isbn.convert_10_to_13(other_isbn)
related_edition = add_by_isbn(other_isbn, work=work)
@@ -466,7 +470,7 @@ def add_related(isbn):
related_edition.work = work
related_edition.save()
elif related_edition.work_id != work.id:
- logger.debug("merge_works path 1 %s %s", work.id, related_edition.work_id)
+ logger.debug(u"merge_works path 1 %s %s", work.id, related_edition.work_id)
work = merge_works(work, related_edition.work)
else:
if other_editions.has_key(related_language):
@@ -476,14 +480,14 @@ def add_related(isbn):
# group the other language editions together
for lang_group in other_editions.itervalues():
- logger.debug("lang_group (ed, work): %s", [(ed.id, ed.work_id) for ed in lang_group])
+ logger.debug(u"lang_group (ed, work): %s", [(ed.id, ed.work_id) for ed in lang_group])
if len(lang_group) > 1:
lang_edition = lang_group[0]
- logger.debug("lang_edition.id: %s", lang_edition.id)
+ logger.debug(u"lang_edition.id: %s", lang_edition.id)
# compute the distinct set of works to merge into lang_edition.work
works_to_merge = set([ed.work for ed in lang_group[1:]]) - set([lang_edition.work])
for w in works_to_merge:
logger.debug("merge_works path 2 %s %s", lang_edition.work_id, w.id)
logger.debug(u"merge_works path 2 %s %s", lang_edition.work_id, w.id)
merged_work = merge_works(lang_edition.work, w)
models.WorkRelation.objects.get_or_create(
to_work=lang_group[0].work,
@@ -498,17 +502,21 @@ def thingisbn(isbn):
Library Thing. (takes isbn_10 or isbn_13, returns isbn_10, except for 979 isbns,
which come back as isbn_13')
"""
- logger.info("looking up %s at ThingISBN", isbn)
+ logger.info(u"looking up %s at ThingISBN", isbn)
url = "https://www.librarything.com/api/thingISBN/%s" % isbn
xml = requests.get(url, headers={"User-Agent": settings.USER_AGENT}).content
- doc = ElementTree.fromstring(xml)
- return [e.text for e in doc.findall('isbn')]
+ try:
+ doc = ElementTree.fromstring(xml)
+ return [e.text for e in doc.findall('isbn')]
+ except SyntaxError:
+ # LibraryThing down
+ return []
def merge_works(w1, w2, user=None):
"""will merge the second work (w2) into the first (w1)
"""
logger.info("merging work %s into %s", w2.id, w1.id)
logger.info(u"merging work %s into %s", w2.id, w1.id)
# don't merge if the works are the same or at least one of the works has no id
#(for example, when w2 has already been deleted)
if w1 is None or w2 is None or w1.id == w2.id or w1.id is None or w2.id is None:
@@ -583,7 +591,7 @@ def detach_edition(e):
will detach edition from its work, creating a new stub work. if remerge=true, will see if
there's another work to attach to
"""
- logger.info("splitting edition %s from %s", e, e.work)
+ logger.info(u"splitting edition %s from %s", e, e.work)
w = models.Work(title=e.title, language=e.work.language)
w.save()
@@ -618,7 +626,7 @@ def add_openlibrary(work, hard_refresh=False):
work.save()
# find the first ISBN match in OpenLibrary
- logger.info("looking up openlibrary data for work %s", work.id)
+ logger.info(u"looking up openlibrary data for work %s", work.id)
e = None # openlibrary edition json
w = None # openlibrary work json
@@ -633,7 +641,7 @@ def add_openlibrary(work, hard_refresh=False):
try:
e = _get_json(url, params, type='ol')
except LookupFailure:
- logger.exception("OL lookup failed for %s", isbn_key)
+ logger.exception(u"OL lookup failed for %s", isbn_key)
e = {}
if e.has_key(isbn_key):
if e[isbn_key].has_key('details'):
@@ -673,7 +681,7 @@ def add_openlibrary(work, hard_refresh=False):
)
if e[isbn_key]['details'].has_key('works'):
work_key = e[isbn_key]['details']['works'].pop(0)['key']
- logger.info("got openlibrary work %s for isbn %s", work_key, isbn_key)
+ logger.info(u"got openlibrary work %s for isbn %s", work_key, isbn_key)
models.Identifier.get_or_add(type='olwk', value=work_key, work=work)
try:
w = _get_json("https://openlibrary.org" + work_key, type='ol')
@@ -691,14 +699,14 @@ def add_openlibrary(work, hard_refresh=False):
if w.has_key('subjects') and len(w['subjects']) > len(subjects):
subjects = w['subjects']
except LookupFailure:
- logger.exception("OL lookup failed for %s", work_key)
+ logger.exception(u"OL lookup failed for %s", work_key)
if not subjects:
- logger.warn("unable to find work %s at openlibrary", work.id)
+ logger.warn(u"unable to find work %s at openlibrary", work.id)
return
# add the subjects to the Work
for s in subjects:
logger.info("adding subject %s to work %s", s, work.id)
logger.info(u"adding subject %s to work %s", s, work.id)
subject = models.Subject.set_by_name(s, work=work)
work.save()
@@ -716,9 +724,9 @@ def _get_json(url, params={}, type='gb'):
if response.status_code == 200:
return json.loads(response.content)
else:
- logger.error("unexpected HTTP response: %s", response)
+ logger.error(u"unexpected HTTP response: %s", response)
if response.content:
- logger.error("response content: %s", response.content)
+ logger.error(u"response content: %s", response.content)
raise LookupFailure("GET failed: url=%s and params=%s" % (url, params))
@@ -766,7 +774,7 @@ def load_gutenberg_edition(title, gutenberg_etext_id, ol_work_id, seed_isbn, url
ebook = models.Ebook()
if len(ebooks) > 1:
- logger.warning("There is more than one Ebook matching url {0}".format(url))
+ logger.warning(u"There is more than one Ebook matching url {0}".format(url))
ebook.format = format
@@ -826,8 +834,6 @@ def edition_for_etype(etype, metadata, default=None):
for key in metadata.edition_identifiers.keys():
return edition_for_ident(key, metadata.identifiers[key])
- MATCH_LICENSE = re.compile(r'creativecommons.org/licenses/([^/]+)/')
def load_ebookfile(url, etype):
'''
return a ContentFile if a new ebook has been loaded
@@ -960,8 +966,7 @@ class BasePandataLoader(object):
if contentfile:
contentfile_name = '/loaded/ebook_{}.{}'.format(edition.id, key)
path = default_storage.save(contentfile_name, contentfile)
- lic = MATCH_LICENSE.search(metadata.rights_url)
- license = 'CC {}'.format(lic.group(1).upper()) if lic else ''
+ license = cc.license_from_cc_url(metadata.rights_url)
ebf = models.EbookFile.objects.create(
format=key,
edition=edition,
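
To make the control flow of this module concrete, here is a hedged sketch of the usual lookup chain exercised by the hunks above (Django shell; the ISBN is illustrative): add_by_isbn() consults Google Books, add_related() clusters other ISBNs reported by ThingISBN into the same work, and add_openlibrary() decorates the work with subjects.

# Illustrative only; real identifiers and network access are assumed.
from regluit.core import bookloader

edition = bookloader.add_by_isbn('9780441569595')
if edition is not None:
    new_editions = bookloader.add_related('9780441569595')  # may call merge_works()
    bookloader.add_openlibrary(edition.work)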

View File

@@ -1,8 +1,11 @@
# coding=utf-8
- # mostly constants related to Creative Commons
+ ''' mostly constants related to Creative Commons
+ # let's be DRY with these parameters
+ ## need to add versioned CC entries
+ '''
+ import re
INFO_CC = (
('CC BY-NC-ND', 'by-nc-nd', 'Creative Commons Attribution-NonCommercial-NoDerivs 3.0 Unported (CC BY-NC-ND 3.0)', 'https://creativecommons.org/licenses/by-nc-nd/3.0/', 'Creative Commons Attribution-NonCommercial-NoDerivs'),
@@ -162,3 +165,15 @@ def match_license(license_string):
except ValueError:
pass
return RIGHTS_ALIAS.get(license_string, None)
+ MATCH_LICENSE = re.compile(r'creativecommons.org/licenses/([^/]+)/')
+ def license_from_cc_url(rights_url):
+ if not rights_url:
+ return None
+ lic = MATCH_LICENSE.search(rights_url)
+ if lic:
+ return 'CC {}'.format(lic.group(1).upper())
+ if rights_url.find('openedition.org') >= 0:
+ return 'OPENEDITION'
+ return ''
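
The new license_from_cc_url() centralizes the URL-to-license mapping that BasePandataLoader previously inlined. A sketch of the behavior implied by the body above (the URLs are illustrative):

# Expected results, per the regex and branches shown in the diff.
from regluit.core import cc

cc.license_from_cc_url('https://creativecommons.org/licenses/by-nc-nd/3.0/')  # 'CC BY-NC-ND'
cc.license_from_cc_url('http://books.openedition.org/obp/1234')               # 'OPENEDITION'
cc.license_from_cc_url('https://example.com/terms')                           # '' (unrecognized)
cc.license_from_cc_url(None)                                                  # None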

View File

@@ -45,10 +45,10 @@ def convert_10_to_13(isbn):
except:
return None
- ISBN_REGEX = re.compile(r'^(\d{9}|\d{12})(\d|X)$')
- DASH_REGEX = re.compile(r'[ \-–—]+')
+ ISBN_REGEX = re.compile(r'^(\d{9}[\dX]|\d{13})$')
+ DASH_REGEX = re.compile(u'[ \\-–—‐,;]+') #includes unicode hyphen, endash and emdash
def strip(s):
- """Strips away any - or spaces. If the remaining string is of length 10 or 13
+ """Strips away any - or spaces and some punctuation. If the remaining string is of length 10 or 13
with digits only in anything but the last
check digit (which may be X), then return '' -- otherwise return the remaining string
"""

View File

@@ -2,11 +2,12 @@ import csv
import HTMLParser
import httplib
import logging
- import mechanize
import re
- from datetime import datetime
+ import mechanize
+ import requests
+ from datetime import datetime
from regluit.core import models
logger = logging.getLogger(__name__)
@@ -20,7 +21,7 @@ class LibraryThing(object):
"""
url = "https://www.librarything.com"
csv_file_url = "https://www.librarything.com/export-csv"
def __init__(self, username=None, password=None):
self.username = username
self.password = password
@@ -40,77 +41,98 @@ class LibraryThing(object):
def parse_csv(self):
h = HTMLParser.HTMLParser()
reader = csv.DictReader(self.csv_handle)
- # There are more fields to be parsed out. Note that there is a second author column to handle
- for (i,row) in enumerate(reader):
+ # There are more fields to be parsed out. Note that there is a
+ # second author column to handle
+ for (i, row) in enumerate(reader):
# ISBNs are written like '[123456789x]' in the CSV, suggesting possibility of a list
m = re.match(r'^\[(.*)\]$', row["'ISBNs'"])
if m:
isbn = m.group(1).split()
else:
isbn = []
- yield {'title':h.unescape(row["'TITLE'"]), 'author':h.unescape(row["'AUTHOR (first, last)'"]),
- 'isbn':isbn, 'comment':row["'COMMENT'"],
- 'tags':row["'TAGS'"], 'collections':row["'COLLECTIONS'"],
- 'reviews':h.unescape(row["'REVIEWS'"])}
+ yield {
+ 'title':h.unescape(row["'TITLE'"]),
+ 'author':h.unescape(row["'AUTHOR (first, last)'"]),
+ 'isbn':isbn,
+ 'comment':row["'COMMENT'"],
+ 'tags':row["'TAGS'"],
+ 'collections':row["'COLLECTIONS'"],
+ 'reviews':h.unescape(row["'REVIEWS'"])
+ }
def viewstyle_1(self, rows):
- for (i,row) in enumerate(rows):
+ for (i, row) in enumerate(rows):
book_data = {}
cols = row.xpath('td')
# cover
book_data["cover"] = {"cover_id":cols[0].attrib["id"],
"image": {"width":cols[0].xpath('.//img')[0].attrib['width'],
"src": cols[0].xpath('.//img')[0].attrib['src']}
book_data["cover"] = {
"cover_id":cols[0].attrib["id"],
"image": {
"width":cols[0].xpath('.//img')[0].attrib['width'],
"src": cols[0].xpath('.//img')[0].attrib['src']
}
}
# title
book_data["title"] = {"href":cols[1].xpath('.//a')[0].attrib['href'],
"title":cols[1].xpath('.//a')[0].text}
book_data["title"] = {
"href":cols[1].xpath('.//a')[0].attrib['href'],
"title":cols[1].xpath('.//a')[0].text
}
# extract work_id and book_id from href
try:
(book_data["work_id"], book_data["book_id"]) = re.match("^/work/(.*)/book/(.*)$",book_data["title"]["href"]).groups()
(book_data["work_id"], book_data["book_id"]) = re.match(
"^/work/(.*)/book/(.*)$",
book_data["title"]["href"]
).groups()
except:
(book_data["work_id"], book_data["book_id"]) = (None, None)
# author -- what if there is more than 1? or none?
try:
book_data["author"] = {"display_name":cols[2].xpath('.//a')[0].text,
"href":cols[2].xpath('.//a')[0].attrib['href'],
"name":cols[2].xpath('div')[0].text}
book_data["author"] = {
"display_name":cols[2].xpath('.//a')[0].text,
"href":cols[2].xpath('.//a')[0].attrib['href'],
"name":cols[2].xpath('div')[0].text
}
except:
book_data["author"] = None
# date
book_data["date"] = cols[3].xpath('span')[0].text
# tags: grab tags that are not empty strings
tag_links = cols[4].xpath('.//a')
book_data["tags"] = filter(lambda x: x is not None, [a.text for a in tag_links])
# rating -- count # of stars
book_data["rating"] = len(cols[5].xpath('.//img[@alt="*"]'))
# entry date
book_data["entry_date"] = datetime.date(datetime.strptime(cols[6].xpath('span')[0].text, "%b %d, %Y"))
book_data["entry_date"] = datetime.date(
datetime.strptime(cols[6].xpath('span')[0].text, "%b %d, %Y")
)
yield book_data
def viewstyle_5(self, rows):
# implement this view to get at the ISBNs
- for (i,row) in enumerate(rows):
+ for (i, row) in enumerate(rows):
book_data = {}
cols = row.xpath('td')
# title
book_data["title"] = {"href":cols[0].xpath('.//a')[0].attrib['href'],
"title":cols[0].xpath('.//a')[0].text}
# extract work_id and book_id from href
try:
(book_data["work_id"], book_data["book_id"]) = re.match("^/work/(.*)/book/(.*)$",book_data["title"]["href"]).groups()
(book_data["work_id"], book_data["book_id"]) = re.match(
"^/work/(.*)/book/(.*)$",
book_data["title"]["href"]
).groups()
except:
(book_data["work_id"], book_data["book_id"]) = (None, None)
# tags
tag_links = cols[1].xpath('.//a')
book_data["tags"] = filter(lambda x: x is not None, [a.text for a in tag_links])
@@ -121,13 +143,13 @@ class LibraryThing(object):
except Exception, e:
logger.info("no lc call number for: %s %s", book_data["title"], e)
book_data["lc_call_number"] = None
# subject
subjects = cols[3].xpath('.//div[@class="subjectLine"]')
book_data["subjects"] = [{'href':s.xpath('a')[0].attrib['href'],
'text':s.xpath('a')[0].text} for s in subjects]
# isbn
try:
book_data["isbn"] = cols[4].xpath('.//span')[0].text
@@ -136,90 +158,94 @@ class LibraryThing(object):
book_data["isbn"] = None
except Exception, e:
book_data["isbn"] = None
yield book_data
def parse_user_catalog(self, view_style=1):
from lxml import html
# we can vary viewstyle to get different info
- IMPLEMENTED_STYLES = [1,5]
+ IMPLEMENTED_STYLES = [1, 5]
COLLECTION = 2 # set to get All Collections
if view_style not in IMPLEMENTED_STYLES:
raise NotImplementedError()
style_parser = getattr(self,"viewstyle_%s" % view_style)
style_parser = getattr(self, "viewstyle_%s" % view_style)
next_page = True
offset = 0
cookies = None
# go to the front page of LibraryThing first to pick up relevant session-like cookies
r = requests.get("https://www.librarything.com/")
cookies = r.cookies
while next_page:
url = "https://www.librarything.com/catalog_bottom.php?view=%s&viewstyle=%d&collection=%d&offset=%d" % (self.username,
view_style, COLLECTION, offset)
url = "https://www.librarything.com/catalog_bottom.php?view=%s&viewstyle=%d&collection=%d&offset=%d" % (
self.username, view_style, COLLECTION, offset
)
logger.info("url: %s", url)
if cookies is None:
r = requests.get(url)
else:
r = requests.get(url, cookies=cookies)
if r.status_code != httplib.OK:
raise LibraryThingException("Error accessing %s: %s" % (url, e))
logger.info("Error accessing %s: %s", url, e)
raise LibraryThingException("Error accessing %s: status %s" % (url, r.status_code))
etree = html.fromstring(r.content)
#logger.info("r.content %s", r.content)
cookies = r.cookies # retain the cookies
# look for a page bar
# try to grab the total number of books
# 1 - 50 of 82
try:
count_text = etree.xpath('//td[@class="pbGroup"]')[0].text
- total = int(re.search(r'(\d+)$',count_text).group(1))
+ total = int(re.search(r'(\d+)$', count_text).group(1))
logger.info('total: %d', total)
- except Exception, e: # assume for now that if we can't grab this text, there is no page bar and no books
+ except Exception, e:
+ # assume for now that if we can't grab this text,
+ # there is no page bar and no books
logger.info('Exception {0}'.format(e))
total = 0
- # to do paging we can either look for a next link or just increase the offset by the number of rows.
+ # to do paging we can either look for a next link or just increase the offset
+ # by the number of rows.
# Let's try the latter
# possible_next_link = etree.xpath('//a[@class="pageShuttleButton"]')[0]
rows_xpath = '//table[@id="lt_catalog_list"]/tbody/tr'
# deal with page 1 first and then working on paging through the collection
rows = etree.xpath(rows_xpath)
- i = -1 # have to account for the problem of style_parser(rows) returning nothing
- for (i,row) in enumerate(style_parser(rows)):
- yield row
- # page size = 50, first page offset = 0, second page offset = 50 -- if total = 50 no need to go
- offset += i + 1
+ i = -1 # have to account for the problem of style_parser(rows) returning nothing
+ for (i, row) in enumerate(style_parser(rows)):
+ yield row
+ # page size = 50, first page offset = 0, second page offset = 50
+ # -- if total = 50 no need to go
+ offset += i + 1
if offset >= total:
next_page = False
def load_librarything_into_wishlist(user, lt_username, max_books=None):
"""
- Load a specified LibraryThing shelf (by default: all the books from the LibraryThing account associated with user)
+ Load a specified LibraryThing shelf (by default: all the books
+ from the LibraryThing account associated with user)
"""
from regluit.core import bookloader
from regluit.core import tasks
from itertools import islice
logger.info("Entering into load_librarything_into_wishlist")
lt = LibraryThing(lt_username)
- for (i,book) in enumerate(islice(lt.parse_user_catalog(view_style=5),max_books)):
+ for (i, book) in enumerate(islice(lt.parse_user_catalog(view_style=5), max_books)):
isbn = book["isbn"] # grab the first one
logger.info("%d %s %s", i, book["title"]["title"], isbn)
try:
@@ -229,13 +255,27 @@ def load_librarything_into_wishlist(user, lt_username, max_books=None):
if not edition:
continue
# add the librarything ids to the db since we know them now
- identifier= models.Identifier.get_or_add(type = 'thng', value = book['book_id'], edition = edition, work = edition.work)
- identifier= models.Identifier.get_or_add(type = 'ltwk', value = book['work_id'], work = edition.work)
+ identifier = models.Identifier.get_or_add(
+ type='thng',
+ value=book['book_id'],
+ edition=edition,
+ work=edition.work
+ )
+ identifier = models.Identifier.get_or_add(
+ type='ltwk',
+ value=book['work_id'],
+ work=edition.work
+ )
if book['lc_call_number']:
- identifier= models.Identifier.get_or_add(type = 'lccn', value = book['lc_call_number'], edition = edition, work = edition.work)
+ identifier = models.Identifier.get_or_add(
+ type='lccn',
+ value=book['lc_call_number'],
+ edition=edition,
+ work=edition.work
+ )
user.wishlist.add_work(edition.work, 'librarything', notify=True)
if edition.new:
tasks.populate_edition.delay(edition.isbn_13)
logger.info("Work with isbn %s added to wishlist.", isbn)
except Exception, e:
logger.info ("error adding ISBN %s: %s", isbn, e)
logger.info("error adding ISBN %s: %s", isbn, e)

View File

@@ -16,10 +16,10 @@ from .smashwords import SmashwordsScraper
def get_scraper(url):
scrapers = [
PressbooksScraper,
- HathitrustScraper,
SpringerScraper,
UbiquityScraper,
SmashwordsScraper,
+ HathitrustScraper,
BaseScraper,
]
for scraper in scrapers:
@@ -52,3 +52,9 @@ def add_by_webpage(url, work=None, user=None):
def add_by_sitemap(url, maxnum=None):
return add_from_bookdatas(scrape_sitemap(url, maxnum=maxnum))
+ def scrape_language(url):
+ scraper = get_scraper(url)
+ return scraper.metadata.get('language')
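
get_scraper() returns the first scraper in the list that accepts the URL, so moving HathitrustScraper below SmashwordsScraper changes matching precedence while BaseScraper remains the catch-all. The new scrape_language() helper just runs that dispatch and pulls one field from the scraped metadata; a short sketch (the URL is illustrative):

from regluit.core.loaders import scrape_language

lang = scrape_language('https://www.smashwords.com/books/view/12345')
if lang and lang != 'xx':
    print 'detected language: %s' % lang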

View File

@@ -1,42 +1,54 @@
#!/usr/bin/env python
# encoding: utf-8
- import logging
import datetime
import json
+ import logging
import re
+ from itertools import islice
import requests
- from django.db.models import (Q, F)
+ from django.db.models import Q
- from django.core.files.storage import default_storage
from django.core.files.base import ContentFile
+ from django.core.files.storage import default_storage
import regluit
+ from oaipmh.client import Client
+ from oaipmh.error import IdDoesNotExistError
+ from oaipmh.metadata import MetadataRegistry, oai_dc_reader
+ from regluit.core import bookloader, cc
from regluit.core import models, tasks
- from regluit.core import bookloader
- from regluit.core.bookloader import add_by_isbn, merge_works
+ from regluit.core.bookloader import merge_works
from regluit.core.isbn import ISBN
- from regluit.core.validation import valid_subject
+ from regluit.core.loaders.utils import type_for_url
+ from regluit.core.validation import identifier_cleaner, valid_subject
+ from . import scrape_language
+ from .doab_utils import doab_lang_to_iso_639_1, online_to_download, url_to_provider
logger = logging.getLogger(__name__)
- springercover = re.compile(r'ftp.+springer\.de.+(\d{13}\.jpg)$', flags=re.U)
+ def unlist(alist):
+ if not alist:
+ return None
+ return alist[0]
+ SPRINGER_COVER = re.compile(r'ftp.+springer\.de.+(\d{13}\.jpg)$', flags=re.U)
+ SPRINGER_IMAGE = u'https://images.springer.com/sgw/books/medium/{}.jpg'
def store_doab_cover(doab_id, redo=False):
"""
returns tuple: 1) cover URL, 2) whether newly created (boolean)
"""
- cover_file_name= '/doab/%s/cover' % (doab_id)
+ cover_file_name = '/doab/%s/cover' % (doab_id)
# if we don't want to redo and the cover exists, return the URL of the cover
if not redo and default_storage.exists(cover_file_name):
return (default_storage.url(cover_file_name), False)
# download cover image to cover_file
url = "http://www.doabooks.org/doab?func=cover&rid={0}".format(doab_id)
try:
@@ -44,29 +56,31 @@ def store_doab_cover(doab_id, redo=False):
if r.status_code == 302:
redirurl = r.headers['Location']
if redirurl.startswith(u'ftp'):
- springerftp = springercover.match(redirurl)
+ springerftp = SPRINGER_COVER.match(redirurl)
if springerftp:
- redirurl = u'https://images.springer.com/sgw/books/medium/{}.jpg'.format(springerftp.groups(1))
+ redirurl = SPRINGER_IMAGE.format(springerftp.groups(1))
r = requests.get(redirurl)
else:
r = requests.get(url)
else:
r = requests.get(url)
cover_file = ContentFile(r.content)
cover_file.content_type = r.headers.get('content-type', '')
- path = default_storage.save(cover_file_name, cover_file)
+ default_storage.save(cover_file_name, cover_file)
return (default_storage.url(cover_file_name), True)
except Exception, e:
# if there is a problem, return None for cover URL
logger.warning('Failed to make cover image for doab_id={}: {}'.format(doab_id, e))
return (None, False)
- def update_cover_doab(doab_id, edition, store_cover=True):
+ def update_cover_doab(doab_id, edition, store_cover=True, redo=True):
"""
update the cover url for work with doab_id
if store_cover is True, use the cover from our own storage
"""
if store_cover:
- (cover_url, new_cover) = store_doab_cover(doab_id)
+ (cover_url, new_cover) = store_doab_cover(doab_id, redo=redo)
else:
cover_url = "http://www.doabooks.org/doab?func=cover&rid={0}".format(doab_id)
@@ -74,131 +88,133 @@ def update_cover_doab(doab_id, edition, store_cover=True):
edition.cover_image = cover_url
edition.save()
return cover_url
- else:
- return None
+ return None
def attach_more_doab_metadata(edition, description, subjects,
publication_date, publisher_name=None, language=None, authors=u''):
"""
for given edition, attach description, subjects, publication date to
corresponding Edition and Work
"""
# if edition doesn't have a publication date, update it
if not edition.publication_date:
edition.publication_date = publication_date
# if edition.publisher_name is empty, set it
if not edition.publisher_name:
edition.set_publisher(publisher_name)
edition.save()
# attach description to work if it's not empty
work = edition.work
if not work.description:
work.description = description
# update subjects
for s in subjects:
if valid_subject(s):
models.Subject.set_by_name(s, work=work)
# set reading level of work if it's empty; doab is for adults.
if not work.age_level:
work.age_level = '18-'
- if language:
+ if language and language != 'xx':
work.language = language
work.save()
if authors and authors == authors: # test for authors != NaN
authlist = creator_list(authors)
if edition.authors.all().count() < len(authlist):
edition.authors.clear()
if authlist is not None:
- for [rel,auth] in authlist:
+ for [rel, auth] in authlist:
edition.add_author(auth, rel)
return edition
def add_all_isbns(isbns, work, language=None, title=None):
+ first_edition = None
for isbn in isbns:
- first_edition = None
edition = bookloader.add_by_isbn(isbn, work, language=language, title=title)
if edition:
first_edition = first_edition if first_edition else edition
if work and (edition.work_id != work.id):
if work.created < edition.work.created:
work = merge_works(work, edition.work)
else:
work = merge_works(edition.work, work)
else:
work = edition.work
- return first_edition
+ return work, first_edition
def load_doab_edition(title, doab_id, url, format, rights,
language, isbns,
provider, **kwargs):
"""
load a record from doabooks.org represented by input parameters and return an ebook
"""
logger.info('load doab {} {} {} {} {}'.format(doab_id, format, rights, language, provider))
if language and isinstance(language, list):
language = language[0]
+ if language == 'xx' and format == 'online':
+ language = scrape_language(url)
# check to see whether the Edition hasn't already been loaded first
# search by url
ebooks = models.Ebook.objects.filter(url=url)
# 1 match
# > 1 matches
# 0 match
# simplest case -- if match (1 or more), we could check whether any
# ebook.edition.work has a doab id matching given doab_id
# put a migration to force Ebook.url to be unique id
# if yes, then return one of the Edition(s) whose work is doab_id
# if no, then
ebook = None
if len(ebooks) > 1:
raise Exception("There is more than one Ebook matching url {0}".format(url))
elif len(ebooks) == 1:
raise Exception("There is more than one Ebook matching url {0}".format(url))
elif len(ebooks) == 1:
ebook = ebooks[0]
- doab_identifer = models.Identifier.get_or_add(type='doab',value=doab_id,
- work=ebook.edition.work)
- # update the cover id
- cover_url = update_cover_doab(doab_id, ebook.edition)
+ doab_identifer = models.Identifier.get_or_add(type='doab', value=doab_id,
+ work=ebook.edition.work)
+ if not ebook.rights:
+ ebook.rights = rights
+ ebook.save()
+ # update the cover id
+ cover_url = update_cover_doab(doab_id, ebook.edition, redo=False)
# attach more metadata
- attach_more_doab_metadata(ebook.edition,
- description=kwargs.get('description'),
- subjects=kwargs.get('subject'),
- publication_date=kwargs.get('date'),
- publisher_name=kwargs.get('publisher'),
- language=language,
- authors=kwargs.get('authors'),)
+ attach_more_doab_metadata(
+ ebook.edition,
+ description=unlist(kwargs.get('description')),
+ subjects=kwargs.get('subject'),
+ publication_date=unlist(kwargs.get('date')),
+ publisher_name=unlist(kwargs.get('publisher')),
+ language=language,
+ authors=kwargs.get('creator'),
+ )
# make sure all isbns are added
- add_all_isbns(isbns, None, language=language, title=title)
- return ebook
+ add_all_isbns(isbns, ebook.edition.work, language=language, title=title)
+ return ebook.edition
# remaining case --> no ebook, load record, create ebook if there is one.
- assert len(ebooks) == 0
+ assert not ebooks
# we need to find the right Edition/Work to tie Ebook to...
# look for the Edition with which to associate ebook.
# loop through the isbns to see whether we get one that is not None
work = None
- edition = add_all_isbns(isbns, None, language=language, title=title)
- if edition:
- edition.refresh_from_db()
- work = edition.work
+ work, edition = add_all_isbns(isbns, None, language=language, title=title)
if doab_id and not work:
# make sure there's not already a doab_id
idents = models.Identifier.objects.filter(type='doab', value=doab_id)
@@ -206,16 +222,17 @@ def load_doab_edition(title, doab_id, url, format, rights,
edition = ident.work.preferred_edition
work = edition.work
break
- if edition is not None:
- # if this is a new edition, then add related editions asynchronously
- if getattr(edition,'new', False):
- tasks.populate_edition.delay(edition.isbn_13)
- doab_identifer = models.Identifier.get_or_add(type='doab', value=doab_id,
- work=edition.work)
- # we need to create Edition(s) de novo
- else:
+ if edition is not None:
+ # if this is a new edition, then add related editions SYNCHRONOUSLY
+ if getattr(edition, 'new', False):
+ tasks.populate_edition(edition.isbn_13)
+ edition.refresh_from_db()
+ doab_identifer = models.Identifier.get_or_add(type='doab', value=doab_id,
+ work=edition.work)
+ # we need to create Edition(s) de novo
+ else:
# if there is a Work with doab_id already, attach any new Edition(s)
try:
work = models.Identifier.objects.get(type='doab', value=doab_id).work
@@ -226,11 +243,11 @@ def load_doab_edition(title, doab_id, url, format, rights,
work = models.Work(language='xx', title=title, age_level='18-')
work.save()
doab_identifer = models.Identifier.get_or_add(type='doab', value=doab_id,
work=work)
# if work has any ebooks already, attach the ebook to the corresponding edition
# otherwise pick the first one
# pick the first edition as the one to tie ebook to
editions_with_ebooks = models.Edition.objects.filter(Q(work__id=work.id) & \
Q(ebooks__isnull=False)).distinct()
if editions_with_ebooks:
@@ -240,73 +257,41 @@ def load_doab_edition(title, doab_id, url, format, rights,
else:
edition = models.Edition(work=work, title=title)
edition.save()
# make the edition the selected_edition of the work
work.selected_edition = edition
work.save()
- if format in ('pdf', 'epub', 'mobi'):
+ if format in ('pdf', 'epub', 'mobi', 'html', 'online') and rights:
ebook = models.Ebook()
ebook.format = format
ebook.provider = provider
ebook.url = url
ebook.rights = rights
# tie the edition to ebook
ebook.edition = edition
if format == "online":
ebook.active = False
ebook.save()
# update the cover id (could be done separately)
- cover_url = update_cover_doab(doab_id, edition)
+ cover_url = update_cover_doab(doab_id, edition, redo=False)
# attach more metadata
- attach_more_doab_metadata(edition,
- description=kwargs.get('description'),
- subjects=kwargs.get('subject'),
- publication_date=kwargs.get('date'),
- publisher_name=kwargs.get('publisher'),
- authors=kwargs.get('authors'),)
- return ebook
+ attach_more_doab_metadata(
+ edition,
+ description=unlist(kwargs.get('description')),
+ subjects=kwargs.get('subject'),
+ publication_date=unlist(kwargs.get('date')),
+ publisher_name=unlist(kwargs.get('publisher')),
+ authors=kwargs.get('creator'),
+ )
+ return edition
- def load_doab_records(fname, limit=None):
- success_count = 0
- ebook_count = 0
- records = json.load(open(fname))
- for (i, book) in enumerate(islice(records,limit)):
- d = dict(book)
- d['isbns'] = split_isbns(d['isbns_raw']) # use stricter isbn string parsing.
- try:
- ebook = load_doab_edition(**d)
- success_count += 1
- if ebook:
- ebook_count +=1
- except Exception, e:
- logger.error(e)
- logger.error(book)
- logger.info("Number of records processed: " + str(success_count))
- logger.info("Number of ebooks processed: " + str(ebook_count))
- """
- #
- #tools to parse the author lists in doab.csv
- from pandas import DataFrame
- url = "http://www.doabooks.org/doab?func=csv"
- df_csv = DataFrame.from_csv(url)
- #
- out=[]
- for val in df_csv.values:
- isbn = split_isbns(val[0])
- if isbn:
- auths = []
- if val[2] == val[2] and val[-2] == val[-2]: # test for NaN auths and licenses
- auths = creator_list(val[2])
- out.append(( isbn[0], auths))
- open("/Users/eric/doab_auths.json","w+").write(json.dumps(out,indent=2, separators=(',', ': ')))
- """
au = re.compile(r'\(Authors?\)', flags=re.U)
ed = re.compile(r'\([^\)]*(dir.|[Eeé]ds?.|org.|coord.|Editor|a cura di|archivist)[^\)]*\)', flags=re.U)
tr = re.compile(r'\([^\)]*([Tt]rans.|tr.|translated by)[^\)]*\)', flags=re.U)
@@ -326,14 +311,14 @@ def fnf(auth):
if len(parts) == 1:
return parts[0].strip()
elif len(parts) == 2:
- return u'{} {}'.format(parts[1].strip(),parts[0].strip())
+ return u'{} {}'.format(parts[1].strip(), parts[0].strip())
else:
- if parts[1].strip() in ('der','van', 'von', 'de', 'ter'):
- return u'{} {} {}'.format(parts[2].strip(),parts[1].strip(),parts[0].strip())
+ if parts[1].strip() in ('der', 'van', 'von', 'de', 'ter'):
+ return u'{} {} {}'.format(parts[2].strip(), parts[1].strip(), parts[0].strip())
#print auth
#print re.search(namelist,auth).group(0)
- return u'{} {}, {}'.format(parts[2].strip(),parts[0].strip(),parts[1].strip())
+ return u'{} {}, {}'.format(parts[2].strip(), parts[0].strip(), parts[1].strip())
def creator(auth, editor=False):
auth = auth.strip()
@@ -349,68 +334,100 @@ def creator(auth, editor=False):
return [u'dsr', fnf(ds.sub(u'', auth))]
if re.search(cm, auth):
return [u'com', fnf(cm.sub(u'', auth))]
auth = au.sub('', auth)
return ['aut', fnf(auth)]
- def split_auths(auths):
- if ';' in auths or '/' in auths:
- return namesep2.split(auths)
- else:
- nl = namelist.match(auths.strip())
- if nl:
- if nl.group(3).endswith(' de') \
- or ' de ' in nl.group(3) \
- or nl.group(3).endswith(' da') \
- or nl.group(1).endswith(' Jr.') \
- or ' e ' in nl.group(1):
- return [auths]
- else:
- return namesep.split(auths)
- else :
- return [auths]
- def split_isbns(isbns):
- result = []
- for isbn in isbnsep.split(isbns):
- isbn = ISBN(isbn)
- if isbn.valid:
- result.append(isbn.to_string())
- return result
def creator_list(creators):
auths = []
- if re.search(edlist, creators):
- for auth in split_auths(edlist.sub(u'', creators)):
- if auth:
- auths.append(creator(auth, editor=True))
- else:
- for auth in split_auths(unicode(creators)):
- if auth:
- auths.append(creator(auth))
+ for auth in creators:
+ auths.append(creator(auth))
return auths
- def load_doab_auths(fname, limit=None):
- doab_auths = json.load(open(fname))
- recnum = 0
- failed = 0
- for [isbnraw, authlist] in doab_auths:
- isbn = ISBN(isbnraw).to_string()
- try:
- work = models.Identifier.objects.get(type='isbn',value=isbn).work
- except models.Identifier.DoesNotExist:
- print 'isbn = {} not found'.format(isbnraw)
- failed += 1
- if work.preferred_edition.authors.all().count() < len(authlist):
- work.preferred_edition.authors.clear()
- if authlist is None:
- print "null authlist; isbn={}".format(isbn)
+ DOAB_OAIURL = 'https://www.doabooks.org/oai'
+ DOAB_PATT = re.compile(r'[\./]doabooks\.org/doab\?.*rid:(\d{1,8}).*')
+ mdregistry = MetadataRegistry()
+ mdregistry.registerReader('oai_dc', oai_dc_reader)
+ doab_client = Client(DOAB_OAIURL, mdregistry)
+ isbn_cleaner = identifier_cleaner('isbn', quiet=True)
+ ISBNSEP = re.compile(r'[/]+')
+ def add_by_doab(doab_id, record=None):
+ try:
+ record = record if record else doab_client.getRecord(
+ metadataPrefix='oai_dc',
+ identifier='oai:doab-books:{}'.format(doab_id)
+ )
+ metadata = record[1].getMap()
+ isbns = []
+ url = None
+ for ident in metadata.pop('identifier', []):
+ if ident.startswith('ISBN: '):
+ isbn_strings = ISBNSEP.split(ident[6:].strip())
+ for isbn_string in isbn_strings:
+ isbn = isbn_cleaner(isbn_string)
+ if isbn:
+ isbns.append(isbn)
+ elif ident.find('doabooks.org') >= 0:
+ # should already know the doab_id
+ continue
- for [rel,auth] in authlist:
- work.preferred_edition.add_author(auth, rel)
- recnum +=1
- if limit and recnum > limit:
- break
- logger.info("Number of records processed: " + str(recnum))
- logger.info("Number of missing isbns: " + str(failed))
+ else:
+ url = ident
+ language = doab_lang_to_iso_639_1(unlist(metadata.pop('language', None)))
+ urls = online_to_download(url)
+ edition = None
+ title = unlist(metadata.pop('title', None))
+ license = cc.license_from_cc_url(unlist(metadata.pop('rights', None)))
+ for dl_url in urls:
+ format = type_for_url(dl_url)
+ if 'format' in metadata:
+ del metadata['format']
+ edition = load_doab_edition(
+ title,
+ doab_id,
+ dl_url,
+ format,
+ license,
+ language,
+ isbns,
+ url_to_provider(dl_url) if dl_url else None,
+ **metadata
+ )
+ return edition
+ except IdDoesNotExistError:
+ return None
+ def getdoab(url):
+ id_match = DOAB_PATT.search(url)
+ if id_match:
+ return id_match.group(1)
+ return False
+ def load_doab_oai(from_year=None, limit=100000):
+ '''
+ use oai feed to get oai updates
+ '''
+ if from_year:
+ from_ = datetime.datetime(year=from_year, month=1, day=1)
+ else:
+ # last 45 days
+ from_ = datetime.datetime.now() - datetime.timedelta(days=45)
+ doab_ids = []
+ for record in doab_client.listRecords(metadataPrefix='oai_dc', from_=from_):
+ if not record[1]:
+ continue
+ item_type = unlist(record[1].getMap().get('type', None))
+ if item_type != 'book':
+ continue
+ idents = record[1].getMap()['identifier']
+ if idents:
+ for ident in idents:
+ doab = getdoab(ident)
+ if doab:
+ doab_ids.append(doab)
+ e = add_by_doab(doab, record=record)
+ title = e.title if e else None
+ logger.info(u'updated:\t{}\t{}'.format(doab, title))
+ if len(doab_ids) > limit:
+ break
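
Taken together: add_by_doab() fetches one DOAB record over OAI-PMH, splits its ISBN and URL identifiers, and calls load_doab_edition() once per download URL, while load_doab_oai() harvests recent book records and feeds them through the same path. A hedged sketch of driving it from a Django shell (the ID and year are illustrative):

from regluit.core.loaders import doab

edition = doab.add_by_doab('12345')   # one record, via oai:doab-books:12345
doab.load_doab_oai(from_year=2018)    # books updated since 2018-01-01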

core/loaders/doab_utils.py (new file, 128 lines)
View File

@@ -0,0 +1,128 @@
"""
doab_utils.py
"""
import re
import urlparse
import requests
from regluit.utils.lang import get_language_code
from .utils import get_soup
# utility functions for converting lists of individual items into individual items
# let's do a mapping of the DOAB languages into the language codes used
# mostly, we just handle mispellings
# also null -> xx
EXTRA_LANG_MAP = dict([
(u'chinese', 'zh'),
(u'deutsch', 'de'),
(u'eng', 'en'),
(u'englilsh', 'en'),
(u'englisch', 'en'),
(u'espanol', 'es'),
(u'ger', 'de'),
(u'fra', 'fr'),
(u'fre', 'fr'),
(u'francese', 'fr'),
(u'ita', 'it'),
(u'italiano', 'it'),
(u'norwegian', 'no'),
(u'por', 'pt'),
(u'portugese', 'pt'),
(u'slovene', 'sl'),
(u'spa', 'es'),
(u'spagnolo', 'es'),
])
sep = re.compile(r'[ \-;^,/]+')
def doab_lang_to_iso_639_1(lang):
if lang is None or not lang:
return "xx"
else:
lang = sep.split(lang)[0]
code = get_language_code(lang)
if code:
return code
else:
return EXTRA_LANG_MAP.get(lang.lower(), 'xx')
DOMAIN_TO_PROVIDER = dict([
[u'antropologie.zcu.cz', u'AntropoWeb'],
[u'books.mdpi.com', u'MDPI Books'],
[u'books.openedition.org', u'OpenEdition Books'],
[u'books.scielo.org', u'SciELO'],
[u'ccdigitalpress.org', u'Computers and Composition Digital Press'],
[u'digitalcommons.usu.edu', u'DigitalCommons, Utah State University'],
[u'dl.dropboxusercontent.com', u'Dropbox'],
[u'dspace.ucalgary.ca', u'Institutional Repository at the University of Calgary'],
[u'dx.doi.org', u'DOI Resolver'],
[u'ebooks.iospress.nl', u'IOS Press Ebooks'],
[u'hdl.handle.net', u'Handle Proxy'],
[u'hw.oeaw.ac.at', u'Austrian Academy of Sciences'],
[u'img.mdpi.org', u'MDPI Books'],
[u'ledibooks.com', u'LediBooks'],
[u'leo.cilea.it', u'LEO '],
[u'leo.cineca.it', u'Letteratura Elettronica Online'],
[u'link.springer.com', u'Springer'],
[u'oapen.org', u'OAPEN Library'],
[u'press.openedition.org', u'OpenEdition Press'],
[u'windsor.scholarsportal.info', u'Scholars Portal'],
[u'www.adelaide.edu.au', u'University of Adelaide'],
[u'www.aliprandi.org', u'Simone Aliprandi'],
[u'www.antilia.to.it', u'antilia.to.it'],
[u'www.aupress.ca', u'Athabasca University Press'],
[u'www.bloomsburyacademic.com', u'Bloomsbury Academic'],
[u'www.co-action.net', u'Co-Action Publishing'],
[u'www.degruyter.com', u'De Gruyter Online'],
[u'www.doabooks.org', u'Directory of Open Access Books'],
[u'www.dropbox.com', u'Dropbox'],
[u'www.ebooks.iospress.nl', u'IOS Press Ebooks'],
[u'www.ledizioni.it', u'Ledizioni'],
[u'www.maestrantonella.it', u'maestrantonella.it'],
[u'www.oapen.org', u'OAPEN Library'],
[u'www.openbookpublishers.com', u'Open Book Publishers'],
[u'www.palgraveconnect.com', u'Palgrave Connect'],
[u'www.scribd.com', u'Scribd'],
[u'www.springerlink.com', u'Springer'],
[u'www.ubiquitypress.com', u'Ubiquity Press'],
[u'www.unimib.it', u'University of Milano-Bicocca'],
[u'www.unito.it', u"University of Turin"],
])
def url_to_provider(url):
netloc = urlparse.urlparse(url).netloc
return DOMAIN_TO_PROVIDER.get(netloc, netloc)
FRONTIERSIN = re.compile(r'frontiersin.org/books/[^/]+/(\d+)')
def online_to_download(url):
urls = []
if url.find(u'mdpi.com/books/pdfview/book/') >= 0:
doc = get_soup(url)
if doc:
obj = doc.find('object', type='application/pdf')
if obj:
urls.append(obj['data'].split('#')[0])
elif url.find(u'books.scielo.org/') >= 0:
doc = get_soup(url)
if doc:
obj = doc.find('a', class_='pdf_file')
if obj:
urls.append(urlparse.urljoin(url, obj['href']))
obj = doc.find('a', class_='epub_file')
if obj:
urls.append(urlparse.urljoin(url, obj['href']))
elif FRONTIERSIN.search(url):
booknum = FRONTIERSIN.search(url).group(1)
urls.append(u'https://www.frontiersin.org/GetFile.aspx?ebook={}&fileformat=EPUB'.format(booknum))
urls.append(u'https://www.frontiersin.org/GetFile.aspx?ebook={}&fileformat=PDF'.format(booknum))
else:
urls.append(url)
return urls
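
These helpers normalize the messy fields that come back from DOAB: free-text language names to ISO 639-1 codes, landing-page hosts to provider names, and 'online' reader URLs to direct download links. Expected behavior, per the code above (inputs are illustrative):

from regluit.core.loaders.doab_utils import (
    doab_lang_to_iso_639_1, online_to_download, url_to_provider,
)

doab_lang_to_iso_639_1(u'Englisch')   # 'en', via EXTRA_LANG_MAP
doab_lang_to_iso_639_1(None)          # 'xx'
url_to_provider(u'http://www.oapen.org/record/1234')   # u'OAPEN Library'
online_to_download(u'http://example.org/book')         # [u'http://example.org/book']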

View File

@@ -26,38 +26,54 @@ class HathitrustScraper(BaseScraper):
for record in records:
self.record = record
return
- self.record = {}
+ self.record = None # probably a hdl not pointing at Hathitrust
def get_downloads(self):
- dl_a = self.doc.select_one('#fullPdfLink')
- value = dl_a['href'] if dl_a else None
- if value:
- self.set(
- 'download_url_{}'.format('pdf'),
- 'https://babel.hathitrust.org{}'.format(value)
- )
+ if self.record:
+ dl_a = self.doc.select_one('#fullPdfLink')
+ value = dl_a['href'] if dl_a else None
+ if value:
+ self.set(
+ 'download_url_{}'.format('pdf'),
+ 'https://babel.hathitrust.org{}'.format(value)
+ )
return super(HathitrustScraper, self).get_downloads()
def get_isbns(self):
- isbn = self.record.get('issn', [])
- value = identifier_cleaner('isbn', quiet=True)(isbn)
- return {'print': value} if value else {}
+ if self.record:
+ isbn = self.record.get('issn', [])
+ value = identifier_cleaner('isbn', quiet=True)(isbn)
+ return {'print': value} if value else {}
+ return super(HathitrustScraper, self).get_isbns()
def get_title(self):
- self.set('title', self.record.get('title', ''))
+ if self.record:
+ self.set('title', self.record.get('title', ''))
return super(HathitrustScraper, self).get_title()
def get_keywords(self):
- self.set('subjects', self.record.get('keywords', []))
+ if self.record:
+ self.set('subjects', self.record.get('keywords', []))
return super(HathitrustScraper, self).get_keywords()
def get_publisher(self):
- self.set('publisher', self.record.get('publisher', ''))
+ if self.record:
+ self.set('publisher', self.record.get('publisher', ''))
return super(HathitrustScraper, self).get_publisher()
def get_pubdate(self):
- self.set('publication_date', self.record.get('year', ''))
+ if self.record:
+ self.set('publication_date', self.record.get('year', ''))
return super(HathitrustScraper, self).get_pubdate()
def get_description(self):
- notes = self.record.get('notes', [])
- self.set('description', '\r'.join(notes))
+ if self.record:
+ notes = self.record.get('notes', [])
+ self.set('description', '\r'.join(notes))
return super(HathitrustScraper, self).get_description()
def get_genre(self):
- self.set('genre', self.record.get('type_of_reference', '').lower())
+ if self.record:
+ self.set('genre', self.record.get('type_of_reference', '').lower())
return super(HathitrustScraper, self).get_genre()

View File

@@ -110,15 +110,19 @@ class SpringerScraper(BaseScraper):
self.set('publisher', 'Springer')
search_url = 'https://link.springer.com/search/page/{}?facet-content-type=%22Book%22&package=openaccess'
- def load_springer(num_pages):
- def springer_open_books(num_pages):
- for page in range(1, num_pages+1):
+ def load_springer(startpage=1, endpage=None):
+ def springer_open_books(startpage, endpage):
+ endpage = endpage if endpage else startpage + 10
+ for page in range(startpage, endpage + 1):
url = search_url.format(page)
- response = requests.get(url, headers={"User-Agent": settings.USER_AGENT})
- if response.status_code == 200:
- base = response.url
- doc = BeautifulSoup(response.content, 'lxml')
- for link in doc.select('a.title'):
- book_url = urljoin(base, link['href'])
- yield SpringerScraper(book_url)
- return add_from_bookdatas(springer_open_books(num_pages))
+ try:
+ response = requests.get(url, headers={"User-Agent": settings.USER_AGENT})
+ if response.status_code == 200:
+ base = response.url
+ doc = BeautifulSoup(response.content, 'lxml')
+ for link in doc.select('a.title'):
+ book_url = urljoin(base, link['href'])
+ yield SpringerScraper(book_url)
+ except requests.exceptions.ConnectionError:
+ print 'couldn\'t connect to %s' % url
+ return add_from_bookdatas(springer_open_books(startpage, endpage))
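
load_springer() now crawls an explicit page range of the open-access search results instead of always starting at page 1, and a ConnectionError on one page no longer aborts the whole run. A hedged one-liner (the module path is an assumption):

from regluit.core.loaders.springer import load_springer

editions = load_springer(startpage=3, endpage=5)  # scrape search pages 3 through 5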

core/loaders/tests.py (new file, 28 lines)
View File

@@ -0,0 +1,28 @@
from django.conf import settings
from django.test import TestCase
from regluit.core.models import Ebook, Edition, Work
from .utils import dl_online
class LoaderTests(TestCase):
def setUp(self):
pass
def test_downloads(self):
if not (settings.TEST_INTEGRATION):
return
work = Work(title="online work")
work.save()
edition = Edition(work=work)
edition.save()
dropbox_url = 'https://www.dropbox.com/s/h5jzpb4vknk8n7w/Jakobsson_The_Troll_Inside_You_EBook.pdf?dl=0'
dropbox_ebook = Ebook.objects.create(format='online', url=dropbox_url, edition=edition)
dropbox_ebf = dl_online(dropbox_ebook)
self.assertTrue(dropbox_ebf.ebook.filesize)
jbe_url = 'http://www.jbe-platform.com/content/books/9789027295958'
jbe_ebook = Ebook.objects.create(format='online', url=jbe_url, edition=edition)
jbe_ebf = dl_online(jbe_ebook)
self.assertTrue(jbe_ebf.ebook.filesize)

View File

@ -1,15 +1,23 @@
import csv
import re
import requests
import logging
import sys
import re
import time
import unicodedata
import urlparse
from bs4 import BeautifulSoup
import requests
from django.conf import settings
from django.core.files.base import ContentFile
from regluit.core.models import Work, Edition, Author, PublisherName, Identifier, Subject
from regluit.core.isbn import ISBN
from regluit.core.bookloader import add_by_isbn_from_google, merge_works
from regluit.api.crosswalks import inv_relator_contrib
from regluit.bisac.models import BisacHeading
from regluit.core.bookloader import add_by_isbn_from_google, merge_works
from regluit.core.isbn import ISBN
from regluit.core.models import (
Ebook, EbookFile, Edition, Identifier, path_for_file, Subject, Work,
)
logger = logging.getLogger(__name__)
@ -22,7 +30,7 @@ def utf8_general_ci_norm(s):
"""
Normalize a la MySQL utf8_general_ci collation
(As of 2016.05.24, we're using the utf8_general_ci collation for author names)
https://stackoverflow.com/questions/1036454/what-are-the-diffrences-between-utf8-general-ci-and-utf8-unicode-ci/1036459#1036459
* converts to Unicode normalization form D for canonical decomposition
@ -34,79 +42,84 @@ def utf8_general_ci_norm(s):
s1 = unicodedata.normalize('NFD', s)
return ''.join(c for c in s1 if not unicodedata.combining(c)).upper()
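# Worked example: NFD decomposition splits u'é' into 'e' plus a combining
# acute accent, the combining mark is dropped, and the result is uppercased,
# so accented variants compare equal under the collation:
assert utf8_general_ci_norm(u'Renée') == utf8_general_ci_norm(u'RENEE') == u'RENEE'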
def get_soup(url):
response = requests.get(url, headers={"User-Agent": settings.USER_AGENT})
if response.status_code == 200:
return BeautifulSoup(response.content, 'lxml')
return None
def get_authors(book):
authors=[]
if book.get('AuthorsList',''):
authors = []
if book.get('AuthorsList', ''):
#UMich
for i in range(1,3):
fname=u'Author{}First'.format(i)
lname=u'Author{}Last'.format(i)
role=u'Author{}Role'.format(i)
authname = u'{} {}'.format(book[fname],book[lname])
for i in range(1, 3):
fname = u'Author{}First'.format(i)
lname = u'Author{}Last'.format(i)
role = u'Author{}Role'.format(i)
authname = u'{} {}'.format(book[fname], book[lname])
if authname != u' ':
role = book[role] if book[role].strip() else 'A01'
authors.append((authname,role))
authors.append((authname, role))
else:
break
authlist = book["AuthorsList"].replace(' and ', ', ').split(', ')
if len(authlist)>3:
if len(authlist) > 3:
for authname in authlist[3:]:
authors.append((authname, 'A01'))
else:
#OBP
for i in range(1,6):
fname= book.get(u'Contributor {} first name'.format(i), '')
lname= book.get(u'Contributor {} surname'.format(i), '')
role= book.get(u'ONIX Role Code (List 17){}'.format(i), '')
authname = u'{} {}'.format(fname,lname)
for i in range(1, 6):
fname = book.get(u'Contributor {} first name'.format(i), '')
lname = book.get(u'Contributor {} surname'.format(i), '')
role = book.get(u'ONIX Role Code (List 17){}'.format(i), '')
authname = u'{} {}'.format(fname, lname)
if authname != u' ':
role = role if role.strip() else 'A01'
authors.append((authname,role))
authors.append((authname, role))
else:
break
return authors
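# A hypothetical UMich-dialect record exercising get_authors above; only the
# Author1/Author2 keys are read because the loop runs range(1, 3), and a
# blank role defaults to 'A01' (author):
example_book = {
    'AuthorsList': 'Jane Doe and John Roe',
    'Author1First': 'Jane', 'Author1Last': 'Doe', 'Author1Role': '',
    'Author2First': 'John', 'Author2Last': 'Roe', 'Author2Role': 'B01',
}
assert get_authors(example_book) == [(u'Jane Doe', 'A01'), (u'John Roe', 'B01')]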
def get_subjects(book):
subjects=[]
for i in range(1,5):
subjects = []
for i in range(1, 5):
key = u'BISACCode{}'.format(i) #UMich dialect
key2 = u'BISAC subject code {}'.format(i) #OBP dialect
code = book.get(key,'')
code = code if code else book.get(key2,'')
code = book.get(key, '')
code = code if code else book.get(key2, '')
if code != '':
try:
bisac=BisacHeading.objects.get(notation=code)
bisac = BisacHeading.objects.get(notation=code)
subjects.append(bisac)
except BisacHeading.DoesNotExist:
logger.warning( "Please add BISAC {}".format(code))
logger.warning("Please add BISAC {}".format(code))
return subjects
def add_subject(subject_name, work, authority=''):
try:
subject= Subject.objects.get(name=subject_name)
subject = Subject.objects.get(name=subject_name)
except Subject.DoesNotExist:
subject=Subject.objects.create(name=subject_name, authority=authority)
subject = Subject.objects.create(name=subject_name, authority=authority)
subject.works.add(work)
def get_title(book):
title = book.get('FullTitle','') #UMICH
title = book.get('FullTitle', '') #UMICH
if title:
return title
title = book.get('Title','') #OBP
sub = book.get('Subtitle','')
title = book.get('Title', '') #OBP
sub = book.get('Subtitle', '')
if sub:
return u'{}: {}'.format(title,sub)
else:
return title
return u'{}: {}'.format(title, sub)
return title
def get_cover(book):
cover_url = book.get('Cover URL','') #OBP
cover_url = book.get('Cover URL', '') #OBP
if cover_url:
return cover_url
url = book['URL']
if "10.3998" in url:
# code for umich books; can generalize, of course!
idmatch= re.search( r'([^/]+)\.(\d+\.\d+\.\d+)', url)
idmatch = re.search(r'([^/]+)\.(\d+\.\d+\.\d+)', url)
if idmatch:
book_id = idmatch.group(2)
if idmatch.group(1) == 'ohp':
@ -116,74 +129,78 @@ def get_cover(book):
else:
cover_url = "http://quod.lib.umich.edu/d/dculture/images/{}.jpg".format(book_id)
cover = requests.head(cover_url)
if cover.status_code<400:
if cover.status_code < 400:
return cover_url
else:
logger.warning( "bad cover: {} for: {}".format(cover_url, url))
logger.warning("bad cover: {} for: {}".format(cover_url, url))
def get_isbns(book):
isbns = []
edition = None
#'ISBN 1' is OBP, others are UMICH
for code in ['eISBN', 'ISBN 3','PaperISBN', 'ISBN 2', 'ClothISBN', 'ISBN 1', 'ISBN 4', 'ISBN 5']:
if book.get(code, '') not in ('','N/A'):
for code in ['eISBN', 'ISBN 3', 'PaperISBN', 'ISBN 2', 'ClothISBN',
'ISBN 1', 'ISBN 4', 'ISBN 5'
]:
if book.get(code, '') not in ('', 'N/A'):
values = book[code].split(',')
for value in values:
isbn = ISBN(value).to_string()
if isbn:
isbns.append(isbn)
for isbn in isbns :
for isbn in isbns:
if not edition:
edition = Edition.get_by_isbn(isbn)
return (isbns, edition )
return (isbns, edition)
def get_pubdate(book):
value = book.get('CopyrightYear','') #UMICH
value = book.get('CopyrightYear', '') #UMICH
if value:
return value
value = book.get('publication year','') #OBP
sub = book.get('publication month','')
sub2 = book.get('publication day','')
value = book.get('publication year', '') #OBP
sub = book.get('publication month', '')
sub2 = book.get('publication day', '')
if sub2:
return u'{}-{}-{}'.format(value,sub,sub2)
return u'{}-{}-{}'.format(value, sub, sub2)
elif sub:
return u'{}-{}'.format(value,sub,sub2)
else:
return value
return u'{}-{}'.format(value, sub)
return value
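# Quick checks of the dialect fallbacks above (field values are made up):
assert get_title({'FullTitle': 'A Book'}) == 'A Book'  # UMich
assert get_title({'Title': 'A Book', 'Subtitle': 'An Essay'}) == u'A Book: An Essay'  # OBP
assert get_pubdate({'CopyrightYear': '2016'}) == '2016'  # UMich
assert get_pubdate({'publication year': '2016', 'publication month': '05'}) == u'2016-05'  # OBP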
def get_publisher(book):
value = book.get('Publisher','')
value = book.get('Publisher', '')
if value:
return value
if book.get('DOI prefix','')=='10.11647':
if book.get('DOI prefix', '') == '10.11647':
return "Open Book Publishers"
def get_url(book):
url = book.get('URL','')
url = url if url else u'https://doi.org/{}/{}'.format( book.get('DOI prefix',''),book.get('DOI suffix',''))
url = book.get('URL', '')
url = url if url else u'https://doi.org/{}/{}'.format(
book.get('DOI prefix', ''),
book.get('DOI suffix', '')
)
return url
def get_description(book):
value = book.get('DescriptionBrief','')
value = value if value else book.get('Plain Text Blurb','')
value = book.get('DescriptionBrief', '')
value = value if value else book.get('Plain Text Blurb', '')
return value
def get_language(book):
value = book.get('ISO Language Code','')
value = book.get('ISO Language Code', '')
return value
def load_from_books(books):
''' books is an iterator of book dicts.
each book must have attributes
(umich dialect)
eISBN, ClothISBN, PaperISBN, Publisher, FullTitle, Title, Subtitle, AuthorsList,
Author1Last, Author1First, Author1Role, Author2Last, Author2First, Author2Role, Author3Last,
Author3First, Author3Role, AuthorBio, TableOfContents, Excerpt, DescriptionLong,
DescriptionBrief, BISACCode1, BISACCode2, BISACCode3, CopyrightYear, ePublicationDate,
eListPrice, ListPriceCurrencyType, List Price in USD (paper ISBN), eTerritoryRights,
eISBN, ClothISBN, PaperISBN, Publisher, FullTitle, Title, Subtitle, AuthorsList,
Author1Last, Author1First, Author1Role, Author2Last, Author2First, Author2Role, Author3Last,
Author3First, Author3Role, AuthorBio, TableOfContents, Excerpt, DescriptionLong,
DescriptionBrief, BISACCode1, BISACCode2, BISACCode3, CopyrightYear, ePublicationDate,
eListPrice, ListPriceCurrencyType, List Price in USD (paper ISBN), eTerritoryRights,
SubjectListMARC, , Book-level DOI, URL, License
'''
# Goal: get or create an Edition and Work for each given book
@ -194,21 +211,21 @@ def load_from_books(books):
# try first to get an Edition already in DB with by one of the ISBNs in book
(isbns, edition) = get_isbns(book)
if len(isbns)==0:
if not isbns:
continue
title=get_title(book)
title = get_title(book)
authors = get_authors(book)
# if matching by ISBN doesn't work, then create a Work and Edition
# if matching by ISBN doesn't work, then create a Work and Edition
# with a title and the first ISBN
if not edition:
work = Work(title=title)
work.save()
edition= Edition(title=title, work=work)
edition = Edition(title=title, work=work)
edition.save()
Identifier.set(type='isbn', value=isbns[0], edition=edition, work=work)
work=edition.work
work = edition.work
# at this point, work and edition exist
url = get_url(book)
@ -222,7 +239,7 @@ def load_from_books(books):
if edition and edition.work != work:
work = merge_works(work, edition.work)
if not edition:
edition= Edition(title=title, work=work)
edition = Edition(title=title, work=work)
edition.save()
Identifier.set(type='isbn', value=isbn, edition=edition, work=work)
@ -234,18 +251,18 @@ def load_from_books(books):
edition.save()
edition.set_publisher(get_publisher(book))
# possibly replace work.description
# possibly replace work.description
description = get_description(book)
if len(description)>len (work.description):
if len(description) > len(work.description):
work.description = description
work.save()
# set language
lang= get_language(book)
lang = get_language(book)
if lang:
work.language = lang
work.save()
# add a bisac subject (and ancestors) to work
for bisacsh in get_subjects(book):
while bisacsh:
@ -258,13 +275,13 @@ def load_from_books(books):
results.append((book, work, edition))
try:
logger.info (u"{} {} {}\n".format(i, title, loading_ok))
logger.info(u"{} {} {}\n".format(i, title, loading_ok))
except Exception as e:
logger.info (u"{} {}\n".format(i, title, str(e) ))
logger.info(u"{} {} {}\n".format(i, title, str(e)))
return results
def loaded_book_ok(book, work, edition):
isbns = get_isbns(book)[0]
@ -277,10 +294,10 @@ def loaded_book_ok(book, work, edition):
try:
url_id = Identifier.objects.get(type='http', value=get_url(book))
if url_id is None:
logger.info ("url_id problem: work.id {}, url: {}".format(work.id, get_url(book)))
logger.info("url_id problem: work.id {}, url: {}".format(work.id, get_url(book)))
return False
except Exception as e:
logger.info (str(e))
logger.info(str(e))
return False
# isbns
@ -292,15 +309,17 @@ def loaded_book_ok(book, work, edition):
try:
edition_for_isbn = Identifier.objects.get(type='isbn', value=isbn).edition
except Exception as e:
print (e)
logger.info(e)
return False
# authors
# print set([ed.name for ed in edition_for_isbn.authors.all()])
if (set([utf8_general_ci_norm(author[0]) for author in authors]) !=
set([utf8_general_ci_norm(ed.name) for ed in edition_for_isbn.authors.all()])):
print "problem with authors"
if (
set([utf8_general_ci_norm(author[0]) for author in authors]) !=
set([utf8_general_ci_norm(ed.name) for ed in edition_for_isbn.authors.all()])
):
logger.info("problem with authors")
return False
try:
@ -312,7 +331,7 @@ def loaded_book_ok(book, work, edition):
# work description
description = get_description(book)
if not ((work.description == description) or (len(description) <len (work.description))):
if not ((work.description == description) or (len(description) < len(work.description))):
return False
# bisac
@ -331,14 +350,15 @@ def loaded_book_ok(book, work, edition):
return True
ID_URLPATTERNS = {
'goog': re.compile(r'[\./]google\.com/books\?.*id=([a-zA-Z0-9\-_]{12})'),
'olwk': re.compile(r'[\./]openlibrary\.org(/works/OL\d{1,8}W)'),
'gdrd': re.compile(r'[\./]goodreads\.com/book/show/(\d{1,8})'),
'ltwk': re.compile(r'[\./]librarything\.com/work/(\d{1,8})'),
'oclc': re.compile(r'\.worldcat\.org/.*oclc/(\d{8,12})'),
'doi': re.compile(r'[\./]doi\.org/(10\.\d+/\S+)'),
'gtbg': re.compile(r'[\./]gutenberg\.org/ebooks/(\d{1,6})'),
'glue': re.compile(r'[\./]unglue\.it/work/(\d{1,7})'),
'goog': re.compile(r'[\./]google\.com/books\?.*id=(?P<id>[a-zA-Z0-9\-_]{12})'),
'olwk': re.compile(r'[\./]openlibrary\.org(?P<id>/works/OL\d{1,8}W)'),
'doab': re.compile(r'([\./]doabooks\.org/doab\?.*rid:|=oai:doab-books:)(?P<id>\d{1,8})'),
'gdrd': re.compile(r'[\./]goodreads\.com/book/show/(?P<id>\d{1,8})'),
'ltwk': re.compile(r'[\./]librarything\.com/work/(?P<id>\d{1,8})'),
'oclc': re.compile(r'\.worldcat\.org/.*oclc/(?P<id>\d{8,12})'),
'doi': re.compile(r'[\./]doi\.org/(?P<id>10\.\d+/\S+)'),
'gtbg': re.compile(r'[\./]gutenberg\.org/ebooks/(?P<id>\d{1,6})'),
'glue': re.compile(r'[\./]unglue\.it/work/(?P<id>\d{1,7})'),
}
def ids_from_urls(url):
@ -346,7 +366,128 @@ def ids_from_urls(url):
for ident in ID_URLPATTERNS.keys():
id_match = ID_URLPATTERNS[ident].search(url)
if id_match:
ids[ident] = id_match.group(1)
ids[ident] = id_match.group('id')
return ids
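# Examples of the named-group extraction (URLs are illustrative):
assert ids_from_urls('https://doi.org/10.11647/OBP.0001') == {'doi': '10.11647/OBP.0001'}
assert ids_from_urls('https://www.doabooks.org/doab?func=search&query=rid:12345') == {'doab': '12345'}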
DROPBOX_DL = re.compile(r'"(https://dl.dropboxusercontent.com/content_link/[^"]+)"')
def dl_online(ebook):
if ebook.format != 'online':
pass
elif ebook.url.find(u'dropbox.com/s/') >= 0:
response = requests.get(ebook.url, headers={"User-Agent": settings.USER_AGENT})
if response.status_code == 200:
match_dl = DROPBOX_DL.search(response.content)
if match_dl:
return make_dl_ebook(match_dl.group(1), ebook)
else:
logger.warning('couldn\'t find dl link in {}'.format(ebook.url))
else:
logger.warning('couldn\'t get dl for {}'.format(ebook.url))
elif ebook.url.find(u'jbe-platform.com/content/books/') >= 0:
doc = get_soup(ebook.url)
if doc:
obj = doc.select_one('div.fulltexticoncontainer-PDF a')
if obj:
dl_url = urlparse.urljoin(ebook.url, obj['href'])
return make_dl_ebook(dl_url, ebook)
else:
logger.warning('couldn\'t get dl_url for {}'.format(ebook.url))
else:
logger.warning('couldn\'t get soup for {}'.format(ebook.url))
return None, False
def make_dl_ebook(url, ebook):
if EbookFile.objects.filter(source=ebook.url):
return EbookFile.objects.filter(source=ebook.url)[0], False
response = requests.get(url, headers={"User-Agent": settings.USER_AGENT})
if response.status_code == 200:
filesize = int(response.headers.get("Content-Length", 0))
filesize = filesize if filesize else None
format = type_for_url(url, content_type=response.headers.get('content-type'))
if format != 'online':
new_ebf = EbookFile.objects.create(
edition=ebook.edition,
format=format,
source=ebook.url,
)
new_ebf.file.save(path_for_file(new_ebf, None), ContentFile(response.content))
new_ebf.save()
new_ebook = Ebook.objects.create(
edition=ebook.edition,
format=format,
provider='Unglue.it',
url=new_ebf.file.url,
rights=ebook.rights,
filesize=filesize,
version_label=ebook.version_label,
version_iter=ebook.version_iter,
)
new_ebf.ebook = new_ebook
new_ebf.save()
return new_ebf, True
else:
logger.warning('download format for {} is not ebook'.format(url))
else:
logger.warning('couldn\'t get {}'.format(url))
return None, False
def type_for_url(url, content_type=None):
if not url:
return ''
if url.find('books.openedition.org') >= 0:
return 'online'
if Ebook.objects.filter(url=url):
return Ebook.objects.filter(url=url)[0].format
ct = content_type if content_type else contenttyper.calc_type(url)
if re.search("pdf", ct):
return "pdf"
elif re.search("octet-stream", ct) and re.search("pdf", url, flags=re.I):
return "pdf"
elif re.search("octet-stream", ct) and re.search("epub", url, flags=re.I):
return "epub"
elif re.search("text/plain", ct):
return "text"
elif re.search("text/html", ct):
if url.find('oapen.org/view') >= 0:
return "html"
return "online"
elif re.search("epub", ct):
return "epub"
elif re.search("mobi", ct):
return "mobi"
return "other"
class ContentTyper(object):
""" """
def __init__(self):
self.last_call = dict()
def content_type(self, url):
try:
r = requests.head(url)
return r.headers.get('content-type', '')
except requests.exceptions.RequestException:
return ''
def calc_type(self, url):
delay = 1
# is there a delay associated with the url
netloc = urlparse.urlparse(url).netloc
# wait if necessary
last_call = self.last_call.get(netloc)
if last_call is not None:
now = time.time()
min_time_next_call = last_call + delay
if min_time_next_call > now:
time.sleep(min_time_next_call-now)
self.last_call[netloc] = time.time()
# compute the content-type
return self.content_type(url)
contenttyper = ContentTyper()
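# End-to-end sketch: harvest a single 'online' ebook into a stored file,
# the same flow the "harvest downloadable ebooks" management command below
# drives (Ebook and dl_online are already in this module's namespace;
# requires configured Django settings and network access):
candidate = Ebook.objects.filter(format='online').first()
if candidate:
    new_ebf, created = dl_online(candidate)
    if new_ebf and created:
        print 'stored {} as {}'.format(candidate.url, new_ebf.file.url)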

View File

@ -5,18 +5,18 @@ from regluit.core.models import Work
from regluit.core.loaders.doab import update_cover_doab
class Command(BaseCommand):
help = "make covers for doab editions"
help = "make covers for doab editions with bad covers"
def handle(self, **options):
works = Work.objects.filter(selected_edition__isnull=False, selected_edition__cover_image__isnull=True)
#.filter(selected_edition__isnull=False, selected_edition__cover_image__isnull=True)
#.exclude(selected_edition__identifiers__type='goog')
added = 0
for (i, work) in enumerate(works):
if work.doab and work.selected_edition.googlebooks_id == '':
update_cover_doab(work.doab, work.selected_edition)
added += 1
print ('\r {}:{}'.format(i, added), end='')
print('added {} covers'.format(added))
works = Work.objects.filter(identifiers__type='doab').distinct()
print('checking {} works with doab'.format(works.count()))
num = 0
for work in works:
if not work.cover_image_thumbnail():
update_cover_doab(work.doab, work.preferred_edition, store_cover=True)
#print(work.doab)
num += 1
if num % 10 == 0:
print('{} doab covers updated'.format(num))
#break
print('Done: {} doab covers updated'.format(num))

View File

@ -1,6 +1,7 @@
from django.core.management.base import BaseCommand
from regluit.core.models import Subject
from regluit.core.validation import valid_subject
@ -27,3 +28,8 @@ class Command(BaseCommand):
for work in subject.works.all():
Subject.set_by_name(subject.name, work=work)
subject.delete()
period_subjects = Subject.objects.filter(name__contains=".")
for subject in period_subjects:
if not valid_subject(subject.name):
subject.delete()

View File

@ -1,17 +0,0 @@
import os
from django.conf import settings
from django.contrib.auth.models import User
from django.core.management.base import BaseCommand
from regluit.core.loaders import doab
class Command(BaseCommand):
help = "load doab auths"
args = "<limit> <file_name>"
def handle(self, limit=None, file_name="../../../bookdata/doab_auths.json", **options):
command_dir = os.path.dirname(os.path.realpath(__file__))
file_path = os.path.join(command_dir, file_name)
doab.load_doab_auths(file_path, limit=int(limit) if limit else None)

View File

@ -1,17 +0,0 @@
import os
from django.conf import settings
from django.contrib.auth.models import User
from django.core.management.base import BaseCommand
from regluit.core.loaders import doab
class Command(BaseCommand):
help = "load doab books"
args = "<limit> <file_name>"
def handle(self, limit=None, file_name="../../../bookdata/doab.json", **options):
command_dir = os.path.dirname(os.path.realpath(__file__))
file_path = os.path.join(command_dir, file_name)
doab.load_doab_records(file_path, limit=int(limit))

View File

@ -0,0 +1,21 @@
from django.core.management.base import BaseCommand
from regluit.core.loaders.utils import dl_online
from regluit.core.models import Ebook
class Command(BaseCommand):
help = "harvest downloadable ebooks from 'online' ebooks"
args = "<limit>"
def handle(self, limit=0, **options):
limit = int(limit) if limit else 0
onlines = Ebook.objects.filter(format='online')
done = 0
for online in onlines:
new_ebf, new = dl_online(online)
if new_ebf and new:
done += 1
if done > limit:
break
print 'harvested {} ebooks'.format(done)

View File

@ -30,9 +30,9 @@ class Command(BaseCommand):
books = []
for sitemap in content:
added = add_by_sitemap(sitemap.strip(), maxnum=max)
max = max - len(added)
max = max - len(added) if max else max
books = books + added
if max < 0:
if max and max < 0:
break
else:
books = add_by_sitemap(url, maxnum=max)

View File

@ -4,9 +4,9 @@ from regluit.core.loaders.springer import load_springer
class Command(BaseCommand):
help = "load books from springer open"
args = "<pages>"
args = "<startpage> <endpage>"
def handle(self, pages, **options):
books = load_springer(int(pages))
def handle(self, startpage, endpage=0, **options):
books = load_springer(int(startpage), int(endpage))
print "loaded {} books".format(len(books))

View File

@ -0,0 +1,10 @@
from django.core.management.base import BaseCommand
from regluit.core.loaders import doab
class Command(BaseCommand):
help = "load doab books by doab_id via oai"
args = "<doab_id>"
def handle(self, doab_id, **options):
doab.add_by_doab(doab_id)

View File

@ -0,0 +1,18 @@
from django.core.management.base import BaseCommand
from regluit.core.loaders import doab
class Command(BaseCommand):
help = "load doab books via oai"
args = "<from_year> <limit>"
def handle(self, from_year= None, limit=None, **options):
from_year = int(from_year) if from_year else None
limit = int(limit) if limit else None
if limit:
doab.load_doab_oai(from_year=from_year, limit=limit)
else:
if from_year:
doab.load_doab_oai(from_year=from_year)
else:
doab.load_doab_oai()

View File

@ -4,9 +4,9 @@ from random import randint, randrange
from django.conf import settings
from django.core.management.base import BaseCommand
from django.utils.timezone import now
from regluit.core.models import Work, Campaign
from regluit.utils.localdatetime import now
class Command(BaseCommand):
help = "creates random campaigns for any works that lack one for testing"

View File

@ -26,6 +26,7 @@ from django.core.files.base import ContentFile
from django.db import models
from django.db.models import F, Q
from django.db.models.signals import post_save
from django.utils.timezone import now
from django.utils.translation import ugettext_lazy as _
#regluit imports
@ -45,8 +46,9 @@ from regluit.payment.parameters import (
TRANSACTION_STATUS_FAILED,
TRANSACTION_STATUS_INCOMPLETE
)
from regluit.utils import encryption as crypto
from regluit.utils.localdatetime import now, date_today
from regluit.utils.localdatetime import date_today
from regluit.core.parameters import (
REWARDS,

View File

@ -20,10 +20,10 @@ from django.core.urlresolvers import reverse
from django.db import models
from django.db.models import F
from django.db.models.signals import post_save, pre_delete
from django.utils.timezone import now
import regluit
from regluit.marc.models import MARCRecord as NewMARC
from regluit.utils.localdatetime import now
from questionnaire.models import Landing
from regluit.core import mobi
@ -1082,8 +1082,7 @@ class EbookFile(models.Model):
asking=self.asking,
source=self.file.url
)
new_mobi_ebf.file.save(path_for_file('ebf', None), mobi_cf)
new_mobi_ebf.file.save(path_for_file(new_mobi_ebf, None), mobi_cf)
new_mobi_ebf.save()
if self.ebook:
new_ebook = Ebook.objects.create(

View File

@ -42,7 +42,7 @@ OTHER_ID_CHOICES = (
('edid', 'pragmatic edition ID'),
)
WORK_IDENTIFIERS = ('doi','olwk','glue','ltwk', 'http')
WORK_IDENTIFIERS = ('doi','olwk','glue','ltwk', 'http', 'doab')
ID_CHOICES_MAP = dict(ID_CHOICES)

View File

@ -22,6 +22,7 @@ from django.db.utils import DatabaseError
from django.dispatch import Signal
from django.utils.translation import ugettext_noop as _
from django.template.loader import render_to_string
from django.utils.timezone import now
from notification import models as notification
@ -29,9 +30,9 @@ from notification import models as notification
regluit imports
"""
from regluit.payment.signals import transaction_charged, transaction_failed, pledge_modified, pledge_created
from regluit.utils.localdatetime import now, date_today
from regluit.core.parameters import REWARDS, BUY2UNGLUE, THANKS, LIBRARY, RESERVE, THANKED
from regluit.libraryauth.models import Library, LibraryUser
from regluit.utils.localdatetime import date_today
logger = logging.getLogger(__name__)
@ -100,7 +101,7 @@ def create_notice_types( **kwargs):
notification.create_notice_type("purchase_notgot_gift", _("Your gift wasn't received."), _("The ebook you sent as a gift has not yet been redeemed."))
notification.create_notice_type("donation", _("Your donation was processed."), _("Thank you, your generous donation has been processed."))
signals.post_syncdb.connect(create_notice_types, sender=notification)
signals.post_migrate.connect(create_notice_types, sender=notification)
# define the notifications and tie them to corresponding signals

View File

@ -13,6 +13,7 @@ django imports
from django.conf import settings
from django.contrib.auth.models import User
from django.core.mail import send_mail
from django.utils.timezone import now
from notification.engine import send_all
from notification import models as notification
@ -29,8 +30,7 @@ from regluit.core import (
from regluit.core.models import Campaign, Acq, Gift
from regluit.core.signals import deadline_impending
from regluit.core.parameters import RESERVE, REWARDS, THANKS
from regluit.utils.localdatetime import now, date_today
from regluit.utils.localdatetime import date_today
logger = logging.getLogger(__name__)

File diff suppressed because it is too large

View File

@ -19,7 +19,7 @@ ID_VALIDATION = {
'http': (re.compile(r"(https?|ftp)://(-\.)?([^\s/?\.#]+\.?)+(/[^\s]*)?$",
flags=re.IGNORECASE|re.S),
"The Web Address must be a valid http(s) URL."),
'isbn': (r'^([\dxX\-–— ]+|delete)$',
'isbn': (u'^([\\dxX \\-–—‐,;]+|delete)$', #includes unicode hyphen, endash and emdash
"The ISBN must be a valid ISBN-13."),
'doab': (r'^(\d{1,6}|delete)$',
"The value must be 1-6 digits."),
@ -44,8 +44,6 @@ ID_VALIDATION = {
}
def isbn_cleaner(value):
if value == 'delete':
return value
if not value:
raise ValidationError('no identifier value found')
elif value == 'delete':
@ -132,6 +130,8 @@ def valid_xml_char_ordinal(c):
)
def valid_subject(subject_name):
if len(subject_name) > 200:
return False
num_commas = 0
for c in subject_name:
if not valid_xml_char_ordinal(c):
@ -140,6 +140,10 @@ def valid_subject(subject_name):
num_commas += 1
if num_commas > 2:
return False
if len(subject_name.split('--')) > 6:
return False
if len(subject_name.split('. ')) > 4:
return False
return True
reverse_name_comma = re.compile(r',(?! *Jr[\., ])')
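# Behavior sketch for the new checks (subject strings are made up):
assert valid_subject(u'History -- Europe -- France')  # passes all checks
assert not valid_subject(u'a,b,c,d')                  # more than 2 commas
assert not valid_subject(u'A. B. C. D. E')            # more than 4 '. '-separated parts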

View File

@ -149,14 +149,27 @@ class EditionForm(forms.ModelForm):
id_type = self.cleaned_data['id_type']
id_value = self.cleaned_data.get('id_value','').strip()
if id_value:
identifier = Identifier.objects.filter(type=id_type, value=id_value)
if identifier:
err_msg = "{} is a duplicate for work #{}.".format(identifier[0], identifier[0].work_id)
self.add_error('id_value', forms.ValidationError(err_msg))
try:
self.cleaned_data['id_value'] = identifier_cleaner(id_type)(id_value)
id_value = identifier_cleaner(id_type)(id_value)
identifier = Identifier.objects.filter(type=id_type, value=id_value)
ident = identifier[0] if identifier else None
if not ident or not self.instance:
self.cleaned_data['id_value'] = id_value
elif ident.edition_id == self.instance.id:
self.cleaned_data['id_value'] = id_value
elif not ident.edition_id and ident.work_id == self.instance.work_id:
self.cleaned_data['id_value'] = id_value
else:
if ident.edition_id:
err_msg = "{} is a duplicate for edition #{}.".format(id_value, ident.edition_id)
else:
err_msg = "{} is a duplicate for work #{}.".format(id_value, ident.work_id)
self.add_error('id_value', forms.ValidationError(err_msg))
except forms.ValidationError, ve:
self.add_error('id_value', forms.ValidationError('{}: {}'.format(ve.message, id_value)))
self.add_error(
'id_value',
forms.ValidationError('{}: {}'.format(ve.message, id_value))
)
return self.cleaned_data
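# Decision table for the duplicate check above, where ident is a preexisting
# Identifier with the same type and cleaned value (if any):
#   no such ident, or no edition instance yet  -> accept the value
#   ident already on this edition              -> accept (re-saving same id)
#   work-level ident on this edition's work    -> accept
#   anything else                              -> "duplicate" ValidationError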
class Meta:

View File

@ -13,11 +13,11 @@ from django.conf import settings
from django.forms.extras.widgets import SelectDateWidget
from django.forms.widgets import RadioSelect
from django.utils.translation import ugettext_lazy as _
from django.utils.timezone import now
from regluit.core.lookups import OwnerLookup
from regluit.core.models import Campaign, Edition, Claim, RightsHolder, WasWork
from regluit.core.parameters import *
from regluit.utils.localdatetime import now
class RightsHolderForm(forms.ModelForm):
email = forms.EmailField(

View File

@ -168,7 +168,7 @@
<div class="column show-for-medium">
<span>Contact</span>
<ul>
<li> <a href="mailto:info@ebookfoundation.org"><i class="fa fa-envelope fa-2x"></i></a> <a href="https://twitter.com/unglueit"><i class="fa fa-twitter fa-2x"></i></a> <a href="https://facebook/com/unglueit"><i class="fa fa-facebook fa-2x"></i></a></li>
<li> <a href="mailto:info@ebookfoundation.org"><i class="fa fa-envelope fa-2x"></i></a> <a href="https://twitter.com/unglueit"><i class="fa fa-twitter fa-2x"></i></a> <a href="https://facebook.com/unglueit"><i class="fa fa-facebook fa-2x"></i></a></li>
</ul>
</div>
</div>

View File

@ -1,6 +1,6 @@
{% extends 'work_list.html' %}
{% load endless %}
{% load el_pagination_tags %}
{% load lang_utils %}
{% block title %} Works published by {{ pubname }} {% endblock %}

View File

@ -1,6 +1,6 @@
{% extends 'base.html' %}
{% load endless %}
{% load el_pagination_tags %}
{% load lang_utils %}
{% load sass_tags %}

View File

@ -1,6 +1,6 @@
{% extends 'base.html' %}
{% load endless %}
{% load el_pagination_tags %}
{% load lang_utils %}
{% load sass_tags %}

View File

@ -4,6 +4,7 @@
{% block doccontent %}
<h2>Rights Holder Claim Form </h2>
{% if work %}
<h3> Rightsholder making claim </h3>
{{ rights_holder.rights_holder_name }}
<h3> Work being claimed </h3>
@ -42,4 +43,7 @@
<input type="submit" name="submit" value="Confirm Claim">
</form>
{% endif %}
{% else %}
Please find a work to claim.
{% endif %}
{% endblock %}

View File

@ -1,6 +1,6 @@
{% extends 'base.html' %}
{% load endless %}
{% load el_pagination_tags %}
{% load lang_utils %}
{% load sass_tags %}

View File

@ -1,6 +1,6 @@
{% extends 'base.html' %}
{% load endless %}
{% load el_pagination_tags %}
{% load sass_tags %}
{% load truncatechars %}

View File

@ -1,6 +1,6 @@
{% extends 'work_list.html' %}
{% load endless %}
{% load el_pagination_tags %}
{% load lang_utils %}
{% block title %} Books we're recommending. {% endblock %}

View File

@ -1,6 +1,6 @@
{% extends 'base.html' %}
{% load endless %}
{% load el_pagination_tags %}
{% load truncatechars %}
{% load sass_tags %}

View File

@ -1,6 +1,6 @@
{% extends 'base.html' %}
{% load endless %}
{% load el_pagination_tags %}
{% load lang_utils %}
{% load sass_tags %}

View File

@ -1,6 +1,6 @@
{% extends 'base.html' %}
{% load endless %}
{% load el_pagination_tags %}
{% load lang_utils %}
{% load sass_tags %}

View File

@ -1,5 +1,6 @@
from django import template
from regluit.utils.localdatetime import now
from django.utils.timezone import now
from regluit.core.parameters import REWARDS, BUY2UNGLUE
register = template.Library()

View File

@ -1,12 +1,6 @@
"""
The truncatechars filter is part of Django dev, but we're on 1.3.1
The following is the filter and its dependencies
To use this filter, put "{% load truncatechars %}" at the beginning of your template,
then {{ myvariable|truncatechars:num }}
"""
import unicodedata
from django.template.base import Library
from django.template import Library
from django.template.defaultfilters import stringfilter
from django.utils.translation import get_language_info

View File

@ -1,4 +1,4 @@
from regluit.utils.localdatetime import now
from django.utils.timezone import now
from django import template
register = template.Library()

View File

@ -1,5 +1,6 @@
from regluit.utils.localdatetime import now
from django import template
from django.utils.timezone import now
from regluit.core.models import Acq
register = template.Library()

View File

@ -7,7 +7,7 @@ then {{ myvariable|truncatechars:num }}
import unicodedata
from django import template
from django.template.base import Library
from django.template import Library
from django.template.defaultfilters import stringfilter
from django.utils.encoding import force_unicode
from django.utils.functional import allow_lazy, SimpleLazyObject

View File

@ -3,7 +3,7 @@
"""
from urllib import unquote
from django.template.base import Library
from django.template import Library
from django.template.defaultfilters import stringfilter
register = Library()

View File

@ -13,6 +13,7 @@ from django.core import mail
from django.core.urlresolvers import reverse
from django.test import TestCase
from django.test.client import Client
from django.utils.timezone import now
from notification.models import Notice
@ -21,7 +22,6 @@ from regluit.core.models import Work, Campaign, RightsHolder, Claim, Subject
from regluit.payment.models import Transaction
from regluit.payment.manager import PaymentManager
from regluit.payment.stripelib import StripeClient, TEST_CARDS, ERROR_TESTING, card
from regluit.utils.localdatetime import now
class WishlistTests(TestCase):
fixtures = ['initial_data.json', 'neuromancer.json']

View File

@ -35,7 +35,7 @@ urlpatterns = [
url(r"^rightsholders/campaign/(?P<id>\d+)/mademobi/$", views.manage_campaign, {'action': 'mademobi'}, name="mademobi"),
url(r"^rightsholders/edition/(?P<work_id>\d*)/(?P<edition_id>\d*)$", views.edit_edition, {'by': 'rh'}, name="rh_edition"),
url(r"^rightsholders/edition/(?P<edition_id>\d*)/upload/$", views.edition_uploads, name="edition_uploads"),
url(r"^rightsholders/claim/$", views.claim, name="claim"),
url(r"^rightsholders/claim/$", login_required(views.claim), name="claim"),
url(r"^rightsholders/surveys/$", views.surveys, name="surveys"),
url(r"^rightsholders/new_survey/(?P<work_id>\d*)/?$", views.new_survey, name="new_survey"),
url(r"^rightsholders/surveys/answers_(?P<qid>\d+)_(?P<work_id>\d*).csv$", views.export_surveys, name="survey_answers"),

View File

@ -45,6 +45,7 @@ from django.template import TemplateDoesNotExist
from django.template.loader import render_to_string
from django.utils.http import urlencode
from django.utils.translation import ugettext_lazy as _
from django.utils.timezone import now
from django.views.decorators.csrf import csrf_exempt
from django.views.decorators.http import require_POST
from django.views.generic.edit import FormView
@ -123,11 +124,11 @@ from regluit.payment.parameters import (
COMPANY_TITLE
)
from regluit.utils.localdatetime import now, date_today
from regluit.libraryauth.forms import UserNamePass
from regluit.libraryauth.views import Authenticator, superlogin, login_user
from regluit.libraryauth.models import Library
from regluit.marc.views import qs_marc_records
from regluit.utils.localdatetime import date_today
from questionnaire.models import Landing, Questionnaire
from questionnaire.views import export_summary as answer_summary, export_csv as export_answers

View File

@ -21,6 +21,7 @@ from regluit.core.bookloader import (
from regluit.core.parameters import WORK_IDENTIFIERS
from regluit.core.loaders import add_by_webpage
from regluit.core.loaders.doab import add_by_doab
from regluit.core.loaders.utils import ids_from_urls
from regluit.frontend.forms import EditionForm, IdentifierForm
@ -106,6 +107,11 @@ def get_edition_for_id(id_type, id_value, user=None):
if edition:
return user_edition(edition, user)
if identifiers.has_key('doab'):
edition = add_by_doab(identifiers['doab'])
if edition:
return user_edition(edition, user)
if identifiers.has_key('oclc'):
edition = add_by_oclc(identifiers['oclc'])
if edition:
@ -296,11 +302,17 @@ def edit_edition(request, work_id, edition_id, by=None):
id_type = form.cleaned_data['id_type']
id_val = form.cleaned_data['id_value']
if id_val == 'delete':
if edition.identifiers.exclude(type=id_type):
edition.identifiers.filter(type=id_type).delete()
if id_val == 'delete':
if id_type in WORK_IDENTIFIERS:
if edition.work.identifiers.exclude(type=id_type):
edition.work.identifiers.filter(type=id_type).delete()
else:
alert = ('Can\'t delete identifier - must have at least one left.')
else:
alert = ('Can\'t delete identifier - must have at least one left.')
if edition.identifiers.exclude(type=id_type):
edition.identifiers.filter(type=id_type).delete()
else:
alert = ('Can\'t delete identifier - must have at least one left.')
elif id_val:
models.Identifier.set(
type=id_type,

View File

@ -88,6 +88,8 @@ class ClaimView(CreateView):
return HttpResponseRedirect(reverse('rightsholders'))
def get_context_data(self, form):
if not form.is_valid():
return {'form': form}
work = form.cleaned_data['work']
rights_holder = form.cleaned_data['rights_holder']
active_claims = work.claim.exclude(status = 'release')

View File

@ -1 +1,9 @@
from . import signals
from django.apps import AppConfig
default_app_config = 'regluit.libraryauth.LibraryAuthConfig'
class LibraryAuthConfig(AppConfig):
name = 'regluit.libraryauth'
def ready(self):
from . import signals

View File

@ -8,7 +8,7 @@ from django.core import validators
from django.db import models
from django.db.models import Q
from django.db.models.signals import post_save
from django.forms import IPAddressField as BaseIPAddressField
from django.forms import GenericIPAddressField as BaseIPAddressField
from django.utils.translation import ugettext_lazy as _
from django.core.urlresolvers import reverse

View File

@ -1,6 +1,6 @@
import unicodedata
from django.template.base import Library
from django.template import Library
from .. import models
register = Library()

View File

@ -10,12 +10,12 @@ from datetime import timedelta
django imports
"""
from django.http import HttpResponseForbidden
from django.utils.timezone import now
"""
regluit imports
"""
from regluit.payment.models import PaymentResponse
from regluit.utils.localdatetime import now, zuluformat
class ProcessorError(Exception):
"""An abstraction around payment processor exceptions"""

View File

@ -18,6 +18,7 @@ django imports
from django.conf import settings
from django.contrib.auth.models import User
from django.core.urlresolvers import reverse
from django.utils.timezone import now
"""
regluit imports
@ -26,7 +27,6 @@ from regluit.payment import credit
from regluit.payment.models import Transaction, Receiver, PaymentResponse, Account
from regluit.payment.parameters import *
from regluit.payment.signals import transaction_charged, pledge_modified, pledge_created
from regluit.utils.localdatetime import now
logger = logging.getLogger(__name__)

View File

@ -18,6 +18,7 @@ from django.db.models import Q
from django.contrib.sites.models import Site
from django.db.models.signals import post_save, post_delete
from django.utils.http import urlquote
from django.utils.timezone import now
## django module imports
@ -42,7 +43,7 @@ from regluit.payment.parameters import (
)
from regluit.payment.signals import credit_balance_added, pledge_created
from regluit.utils.localdatetime import now, date_today
from regluit.utils.localdatetime import date_today
logger = logging.getLogger(__name__)

View File

@ -6,12 +6,15 @@ external library imports
"""
import logging
import json
import re
import stripe
from datetime import datetime, timedelta
from itertools import islice
from pytz import utc
import re
import unittest
from unittest import TestCase
import stripe
"""
django imports
@ -19,6 +22,7 @@ django imports
from django.conf import settings
from django.core.mail import send_mail
from django.http import HttpResponse
from django.utils.timezone import now
"""
regluit imports
@ -35,7 +39,6 @@ from regluit.payment.parameters import (
TRANSACTION_STATUS_CANCELED
)
from regluit.payment.signals import transaction_charged, transaction_failed
from regluit.utils.localdatetime import now, zuluformat
# as of 2013.07.15
# ['charge.disputed', 'coupon.updated'] are legacy events -- don't know whether to
@ -73,12 +76,6 @@ def grouper(iterable, page_size):
class StripelibError(baseprocessor.ProcessorError):
pass
try:
import unittest
from unittest import TestCase
except:
from django.test import TestCase
from django.utils import unittest
# if customer.id doesn't exist, create one and then charge the customer
# we probably should ask our users whether they are ok with our creating a customer id account -- or ask for credit

View File

@ -5,6 +5,7 @@ import logging
import os
import time
import traceback
import unittest
from datetime import timedelta
from decimal import Decimal as D
@ -19,7 +20,7 @@ from django.contrib.auth.models import User
from django.core.exceptions import ValidationError
from django.core.validators import URLValidator
from django.test import TestCase
from django.utils import unittest
from django.utils.timezone import now
"""
regluit imports
@ -29,7 +30,6 @@ from regluit.core.signals import handle_transaction_charged
from regluit.payment.manager import PaymentManager
from regluit.payment.models import Transaction, Account
from regluit.payment.parameters import *
from regluit.utils.localdatetime import now
def setup_selenium():
# Set the display window for our xvfb

View File

@ -13,7 +13,7 @@ django imports
"""
from django.conf import settings
from django.contrib.auth.models import User
from django.contrib.sites.models import RequestSite
from django.contrib.sites.requests import RequestSite
from django.core.urlresolvers import reverse
from django.http import (
HttpResponse,
@ -24,6 +24,7 @@ from django.http import (
from django.shortcuts import render_to_response
from django.template import RequestContext
from django.test.utils import setup_test_environment
from django.utils.timezone import now
from django.views.decorators.csrf import csrf_exempt
from django.views.generic.edit import FormView
from django.views.generic.base import TemplateView
@ -38,7 +39,6 @@ from regluit.payment.models import Transaction
from regluit.payment.parameters import *
from regluit.payment.stripelib import STRIPE_PK
from regluit.payment.tests import PledgeTest, AuthorizeTest
from regluit.utils.localdatetime import now
logger = logging.getLogger(__name__)

View File

@ -3,19 +3,14 @@ Fabric==1.6.0
MySQL-python==1.2.5
Pillow==3.4.2
PyJWT==1.4.1
PyPDF2==1.23
PyPDF2==1.26
PyGithub==1.15.0
PyYAML==3.11
git+git://github.com/urschrei/pyzotero.git@v0.9.51
SPARQLWrapper==1.6.4
WebOb==1.2.3
WebTest==1.4.0
amqp==1.4.9
anyjson==0.3.3
billiard==3.3.0.23
awscli==1.10.26
boto==2.42.0
#git+ssh://git@github.com/Gluejar/boto.git@2.3.0
celery==3.1.23
certifi==2016.2.28
# pip installing pillow seems to delete distribute
@ -24,36 +19,34 @@ certifi==2016.2.28
django-celery==3.1.17
django-ckeditor==4.5.1
#django-email-change==0.2.3
git+git://github.com/eshellman/django-email-change.git@1e71dd320504d56b1fc7d447ce4cffb550cedce7
git+git://github.com/eshellman/django-email-change.git@57169bdef1c8a41d122e2bab2dcd8564b8fb231d
django-compat==1.0.10
django-contrib-comments==1.7.1
django-endless-pagination==2.0
django-el-pagination==3.2.4
django-extensions==1.6.1
django-jsonfield==1.0.0
#django-kombu==0.9.4
django-maintenancemode==0.11.2
django-mptt==0.8.5
#django-nose-selenium==0.7.3
#django-notification==0.2
git+git://github.com/eshellman/django-notification.git@412c7a03a327195a1017c2be92c8e2caabc880b6
git+git://github.com/eshellman/django-notification.git@a4620e893e2da220994e0189bf5d980bfbdcf0ad
django-registration==2.1.2
django-selectable==0.9.0
django-smtp-ssl==1.0
django-storages==1.4.1
django-tastypie==0.13.3
django-transmeta==0.7.3
feedparser==5.1.2
#django-transmeta==0.7.3
git+git://github.com/resulto/django-transmeta.git@ad4d7278ba330dcf8c8446f8ae9b2c769ae8684e
fef-questionnaire==4.0.1
freebase==1.0.8
#gitenberg.metadata==0.1.6
git+https://github.com/gitenberg-dev/gitberg-build
#git+ssh://git@github.com/gitenberg-dev/metadata.git@0.1.11
github3.py==0.9.5
html5lib==1.0b3
html5lib==1.0.1
httplib2==0.7.5
isodate==0.5.1
kombu==3.0.35
lxml==2.3.5
lxml==4.2.1
defusedxml==0.4.1
mechanize==0.2.5
mimeparse==0.1.3
@ -66,6 +59,7 @@ paramiko==1.14.1
postmonkey==1.0b
pycrypto==2.6
pymarc==3.0.2
pyoai==2.5.0
pyparsing==2.0.3
python-dateutil==2.5.3
python-mimeparse==0.1.4
@ -75,12 +69,12 @@ pytz==2016.6.1
rdflib==4.2.0
rdflib-jsonld==0.3
redis==2.10.3
reportlab==3.1.8
reportlab==3.4.0
requests==2.10.0
requests-mock==1.2.0
requests-oauthlib==0.6.2
selenium==2.53.1
six==1.9.0
six==1.11.0
sorl-thumbnail==12.3
ssh==1.7.14
stevedore==1.12.0
@ -89,7 +83,8 @@ virtualenv==1.4.9
# virtualenv-clone==0.2.4 not sure why I have this in my env
#virtualenvwrapper==3.6
wsgiref==0.1.2
xhtml2pdf==0.0.6
xhtml2pdf==0.2.2
webencodings==0.5.1
#for urllib3 secure
cffi==1.7.0
cryptography==2.1.4

View File

@ -165,7 +165,7 @@ INSTALLED_APPS = (
'social.apps.django_app.default',
'tastypie',
'djcelery',
'endless_pagination',
'el_pagination',
'selectable',
'regluit.frontend.templatetags',
'notification',

View File

@ -29,7 +29,9 @@ DATABASES = {
'PASSWORD': '',
'HOST': '',
'PORT': '',
'TEST_CHARSET': 'utf8',
'TEST': {
'CHARSET': 'utf8',
}
}
}

View File

@ -20,7 +20,9 @@ DATABASES = {
'PASSWORD': 'regluit',
'HOST': '',
'PORT': '',
'TEST_CHARSET': 'utf8',
'TEST': {
'CHARSET': 'utf8',
}
}
}

View File

@ -22,7 +22,9 @@ DATABASES = {
'PASSWORD': DATABASE_PASSWORD,
'HOST': DATABASE_HOST,
'PORT': '',
'TEST_CHARSET': 'utf8'
'TEST': {
'CHARSET': 'utf8',
}
}
}

View File

@ -21,7 +21,9 @@ DATABASES = {
'PASSWORD': DATABASE_PASSWORD,
'HOST': DATABASE_HOST,
'PORT': '',
'TEST_CHARSET': 'utf8',
'TEST': {
'CHARSET': 'utf8',
}
}
}

View File

@ -23,7 +23,9 @@ DATABASES = {
'PASSWORD': DATABASE_PASSWORD,
'HOST': DATABASE_HOST,
'PORT': '',
'TEST_CHARSET': 'utf8',
'TEST': {
'CHARSET': 'utf8',
}
}
}

View File

@ -1 +1 @@
import localdatetime

View File

@ -1,6 +1,10 @@
from django.conf.global_settings import LANGUAGES
lang2code = dict([ (lang[1].lower(), lang[0]) for lang in LANGUAGES ])
code2lang = dict(LANGUAGES)
def get_language_code(language):
return lang2code.get(language.lower().strip(), '')
language = language.lower().strip()
if language in code2lang:
return language
return lang2code.get(language, '')
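# The helper now accepts either a language name or an ISO code and returns
# a code in both cases (examples rely on Django's global LANGUAGES list):
assert get_language_code('French') == 'fr'
assert get_language_code('fr') == 'fr'      # codes now pass through
assert get_language_code('Klingon') == ''   # unknown -> empty string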

View File

@ -1,140 +1,8 @@
"""
Utility to return datetime.datetime.utcnow() by default but allows for a custom utcnow() (e.g., for testing)
from django.utils.timezone import now
>>> import regluit
>>> from regluit.utils.localdatetime import now
>>> now()
datetime.datetime(2012, 3, 8, 14, 0, 35, 409270)
>>> now()
datetime.datetime(2012, 3, 8, 14, 0, 36, 985271)
>>> n = now()
>>> n
datetime.datetime(2012, 3, 8, 14, 1, 54, 650679)
>>> regluit.utils.localdatetime._now = lambda: n
>>> now()
datetime.datetime(2012, 3, 8, 14, 1, 54, 650679)
>>> now()
datetime.datetime(2012, 3, 8, 14, 1, 54, 650679)
>>> now()
DST handled:
>>> ptz = pytz.timezone('America/Los_Angeles')
>>> make_naive(datetime.datetime(2012,03,11,10,tzinfo=utc), ptz)
datetime.datetime(2012, 3, 11, 3, 0)
>>> make_naive(datetime.datetime(2012,03,11,9,tzinfo=utc), ptz)
datetime.datetime(2012, 3, 11, 1, 0)
>>> make_aware(datetime.datetime(2012,11,4,1,30), ptz)
Traceback (most recent call last):
File "<console>", line 1, in <module>
File "/Users/raymondyee/C/src/Gluejar/regluit/utils/localdatetime.py", line 90, in make_aware
return timezone.localize(value, is_dst=None)
File "/Users/raymondyee/.virtualenvs/regluit/lib/python2.7/site-packages/pytz/tzinfo.py", line 349, in localize
raise AmbiguousTimeError(dt)
AmbiguousTimeError: 2012-11-04 01:30:00
"""
import pytz
import datetime
import django
from django.conf import settings
# for Django 1.3.x, return a timestamp naive now()
# for Django 1.4 should switch to django.utils.timezone.now()
# see https://code.djangoproject.com/browser/django/trunk/django/utils/timezone.py?rev=17642#L232
def now():
if hasattr(settings, 'LOCALDATETIME_NOW') and settings.LOCALDATETIME_NOW is not None:
return settings.LOCALDATETIME_NOW()
else:
try:
return django.utils.timezone.now()
except AttributeError, e:
return datetime.datetime.now()
# provide a replacement for datetime.date.today()
# this will be timezone naive -- is that what we really want?
# switch to django.utils.timezone.localdate in django 1.11
def date_today():
return now().date()
# borrow a lot of the routines/code that will be in Django 1.4+ django.utils.timezone
# https://code.djangoproject.com/browser/django/trunk/django/utils/timezone.py
utc = pytz.utc
def get_default_timezone():
return pytz.timezone(settings.TIME_ZONE)
def is_aware(value):
"""
Determines if a given datetime.datetime is aware.
The logic is described in Python's docs:
http://docs.python.org/library/datetime.html#datetime.tzinfo
"""
return value.tzinfo is not None and value.tzinfo.utcoffset(value) is not None
def is_naive(value):
"""
Determines if a given datetime.datetime is naive.
The logic is described in Python's docs:
http://docs.python.org/library/datetime.html#datetime.tzinfo
"""
return value.tzinfo is None or value.tzinfo.utcoffset(value) is None
def make_aware(value, timezone):
"""
Makes a naive datetime.datetime in a given time zone aware.
"""
if hasattr(timezone, 'localize'):
# available for pytz time zones
return timezone.localize(value, is_dst=None)
else:
# may be wrong around DST changes
return value.replace(tzinfo=timezone)
def make_naive(value, timezone):
"""
Makes an aware datetime.datetime naive in a given time zone.
"""
value = value.astimezone(timezone)
if hasattr(timezone, 'normalize'):
# available for pytz time zones
value = timezone.normalize(value)
return value.replace(tzinfo=None)
def isoformat(value):
"""
if value is naive, assume it's in the default_timezone
"""
if is_naive(value):
return make_aware(value, get_default_timezone()).isoformat()
else:
return value.isoformat()
def zuluformat(value):
"""format value in zulu format -- e.g., 2012-03-26T17:47:22.654449Z"""
return "{0}Z".format(as_utc_naive(value).isoformat())
def as_utc_naive(value):
"""
if value is naive, assume it's in the default time zone, then convert to UTC but make naive
"""
if is_naive(value):
return make_naive(make_aware(value, get_default_timezone()), utc)
else:
return make_naive(value, utc)
def as_default_timezone_naive(value):
"""
if value is naive, assume it's in UTC and convert to the default tz and make it naive
"""
if is_naive(value):
return make_naive(make_aware(value, utc), get_default_timezone())
else:
return make_naive(value, get_default_timezone())
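# django.utils.timezone provides the pieces this deleted module wrapped; a
# rough mapping under the Django 1.8-era API (requires configured settings):
import datetime
from django.utils import timezone

timezone.now()         # aware datetime when USE_TZ=True; replaces now()
timezone.now().date()  # replaces date_today()
naive = datetime.datetime(2012, 3, 8, 14, 0)
aware = timezone.make_aware(naive, timezone.get_default_timezone())
timezone.make_naive(aware, timezone.utc)  # like as_utc_naive(); zuluformat()
                                          # has no direct equivalent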