Merge pull request #766 from Gluejar/doab

Doab OAI harvest
eshellman 2018-04-16 13:47:08 -04:00 committed by GitHub
commit 28c9aaa9b9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
19 changed files with 1067 additions and 54874 deletions


@ -25,7 +25,7 @@ def onix_feed(facet, max=None):
editions = facet.facet_object.filter_model("Edition",editions).distinct()
for edition in editions:
edition_prod = product(edition, facet.facet_object)
if edition_prod:
if edition_prod is not None:
feed.append(edition_prod)
return etree.tostring(feed, pretty_print=True)
@ -34,7 +34,7 @@ def onix_feed_for_work(work):
feed.append(header(work))
for edition in models.Edition.objects.filter(work=work,ebooks__isnull=False).distinct():
edition_prod = product(edition)
if edition_prod:
if edition_prod is not None:
feed.append(product(edition))
return etree.tostring(feed, pretty_print=True)
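The switch to `is not None` matters here because (assuming `product()` returns an lxml element, as the surrounding feed-building code suggests) lxml elements with no child elements test false, so a valid but still-empty Product record would silently be dropped by the old truthiness check. A minimal sketch of the difference:

from lxml import etree

prod = etree.Element('Product')   # a valid element with no children yet
bool(prod)                        # False -- 'if prod:' would skip it
prod is not None                  # True  -- 'if prod is not None:' keeps it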

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large.


@ -49,7 +49,7 @@ def add_by_oclc(isbn, work=None):
def add_by_oclc_from_google(oclc):
if oclc:
logger.info("adding book by oclc %s", oclc)
logger.info(u"adding book by oclc %s", oclc)
else:
return None
try:
@ -59,10 +59,10 @@ def add_by_oclc_from_google(oclc):
try:
results = _get_json(url, {"q": '"OCLC%s"' % oclc})
except LookupFailure, e:
logger.exception("lookup failure for %s", oclc)
logger.exception(u"lookup failure for %s", oclc)
return None
if not results.has_key('items') or not results['items']:
logger.warn("no google hits for %s", oclc)
logger.warn(u"no google hits for %s", oclc)
return None
try:
@ -70,16 +70,16 @@ def add_by_oclc_from_google(oclc):
models.Identifier(type='oclc', value=oclc, edition=e, work=e.work).save()
return e
except LookupFailure, e:
logger.exception("failed to add edition for %s", oclc)
logger.exception(u"failed to add edition for %s", oclc)
except IntegrityError, e:
logger.exception("google books data for %s didn't fit our db", oclc)
logger.exception(u"google books data for %s didn't fit our db", oclc)
return None
def valid_isbn(isbn):
try:
return identifier_cleaner('isbn')(isbn)
except:
logger.exception("invalid isbn: %s", isbn)
logger.exception(u"invalid isbn: %s", isbn)
return None
def add_by_isbn(isbn, work=None, language='xx', title=''):
@ -88,13 +88,13 @@ def add_by_isbn(isbn, work=None, language='xx', title=''):
try:
e = add_by_isbn_from_google(isbn, work=work)
except LookupFailure:
logger.exception("failed google lookup for %s", isbn)
logger.exception(u"failed google lookup for %s", isbn)
# try again some other time
return None
if e:
return e
logger.info("null came back from add_by_isbn_from_google: %s", isbn)
logger.info(u"null came back from add_by_isbn_from_google: %s", isbn)
# if there's a title, we want to create stub editions and
# works, even if google doesn't know about it
# but if it's not valid,
@ -129,10 +129,10 @@ def get_google_isbn_results(isbn):
try:
results = _get_json(url, {"q": "isbn:%s" % isbn})
except LookupFailure:
logger.exception("lookup failure for %s", isbn)
logger.exception(u"lookup failure for %s", isbn)
return None
if not results.has_key('items') or not results['items']:
logger.warn("no google hits for %s", isbn)
logger.warn(u"no google hits for %s", isbn)
return None
return results
@ -201,7 +201,7 @@ def update_edition(edition):
# if the language of the edition no longer matches that of the parent work,
# attach edition to the
if edition.work.language != language:
logger.info("reconnecting %s since it is %s instead of %s",
logger.info(u"reconnecting %s since it is %s instead of %s",
googlebooks_id, language, edition.work.language)
old_work = edition.work
@ -210,7 +210,7 @@ def update_edition(edition):
edition.work = new_work
edition.save()
for identifier in edition.identifiers.all():
logger.info("moving identifier %s", identifier.value)
logger.info(u"moving identifier %s", identifier.value)
identifier.work = new_work
identifier.save()
if old_work and old_work.editions.count() == 0:
@ -256,7 +256,7 @@ def add_by_isbn_from_google(isbn, work=None):
edition.new = False
return edition
logger.info("adding new book by isbn %s", isbn)
logger.info(u"adding new book by isbn %s", isbn)
results = get_google_isbn_results(isbn)
if results:
try:
@ -267,9 +267,9 @@ def add_by_isbn_from_google(isbn, work=None):
isbn=isbn
)
except LookupFailure, e:
logger.exception("failed to add edition for %s", isbn)
logger.exception(u"failed to add edition for %s", isbn)
except IntegrityError, e:
logger.exception("google books data for %s didn't fit our db", isbn)
logger.exception(u"google books data for %s didn't fit our db", isbn)
return None
return None
@ -320,7 +320,7 @@ def add_by_googlebooks_id(googlebooks_id, work=None, results=None, isbn=None):
if results:
item = results
else:
logger.info("loading metadata from google for %s", googlebooks_id)
logger.info(u"loading metadata from google for %s", googlebooks_id)
url = "https://www.googleapis.com/books/v1/volumes/%s" % googlebooks_id
item = _get_json(url)
d = item['volumeInfo']
@ -343,7 +343,7 @@ def add_by_googlebooks_id(googlebooks_id, work=None, results=None, isbn=None):
if len(language) > 5:
language = language[0:5]
if work and work.language != language:
logger.info("not connecting %s since it is %s instead of %s",
logger.info(u"not connecting %s since it is %s instead of %s",
googlebooks_id, language, work.language)
work = None
# isbn = None
@ -371,7 +371,7 @@ def add_by_googlebooks_id(googlebooks_id, work=None, results=None, isbn=None):
try:
e = models.Identifier.objects.get(type='goog', value=googlebooks_id).edition
e.new = False
logger.warning(" whoa nellie, somebody else created an edition while we were working.")
logger.warning(u" whoa nellie, somebody else created an edition while we were working.")
if work.new:
work.delete()
return e
@ -404,19 +404,19 @@ def relate_isbn(isbn, cluster_size=1):
"""add a book by isbn and then see if there's an existing work to add it to so as to make a
cluster bigger than cluster_size.
"""
logger.info("finding a related work for %s", isbn)
logger.info(u"finding a related work for %s", isbn)
edition = add_by_isbn(isbn)
if edition is None:
return None
if edition.work is None:
logger.info("didn't add related to null work")
logger.info(u"didn't add related to null work")
return None
if edition.work.editions.count() > cluster_size:
return edition.work
for other_isbn in thingisbn(isbn):
# 979's come back as 13
logger.debug("other_isbn: %s", other_isbn)
logger.debug(u"other_isbn: %s", other_isbn)
if len(other_isbn) == 10:
other_isbn = regluit.core.isbn.convert_10_to_13(other_isbn)
related_edition = add_by_isbn(other_isbn, work=edition.work)
@ -427,7 +427,7 @@ def relate_isbn(isbn, cluster_size=1):
related_edition.work = edition.work
related_edition.save()
elif related_edition.work_id != edition.work_id:
logger.debug("merge_works path 1 %s %s", edition.work_id, related_edition.work_id)
logger.debug(u"merge_works path 1 %s %s", edition.work_id, related_edition.work_id)
merge_works(related_edition.work, edition.work)
if related_edition.work.editions.count() > cluster_size:
return related_edition.work
@ -438,7 +438,7 @@ def add_related(isbn):
The initial seed ISBN will be added if it's not already there.
"""
# make sure the seed edition is there
logger.info("adding related editions for %s", isbn)
logger.info(u"adding related editions for %s", isbn)
new_editions = []
@ -446,14 +446,14 @@ def add_related(isbn):
if edition is None:
return new_editions
if edition.work is None:
logger.warning("didn't add related to null work")
logger.warning(u"didn't add related to null work")
return new_editions
# this is the work everything will hang off
work = edition.work
other_editions = {}
for other_isbn in thingisbn(isbn):
# 979's come back as 13
logger.debug("other_isbn: %s", other_isbn)
logger.debug(u"other_isbn: %s", other_isbn)
if len(other_isbn) == 10:
other_isbn = regluit.core.isbn.convert_10_to_13(other_isbn)
related_edition = add_by_isbn(other_isbn, work=work)
@ -466,7 +466,7 @@ def add_related(isbn):
related_edition.work = work
related_edition.save()
elif related_edition.work_id != work.id:
logger.debug("merge_works path 1 %s %s", work.id, related_edition.work_id)
logger.debug(u"merge_works path 1 %s %s", work.id, related_edition.work_id)
work = merge_works(work, related_edition.work)
else:
if other_editions.has_key(related_language):
@ -476,14 +476,14 @@ def add_related(isbn):
# group the other language editions together
for lang_group in other_editions.itervalues():
logger.debug("lang_group (ed, work): %s", [(ed.id, ed.work_id) for ed in lang_group])
logger.debug(u"lang_group (ed, work): %s", [(ed.id, ed.work_id) for ed in lang_group])
if len(lang_group) > 1:
lang_edition = lang_group[0]
logger.debug("lang_edition.id: %s", lang_edition.id)
logger.debug(u"lang_edition.id: %s", lang_edition.id)
# compute the distinct set of works to merge into lang_edition.work
works_to_merge = set([ed.work for ed in lang_group[1:]]) - set([lang_edition.work])
for w in works_to_merge:
logger.debug("merge_works path 2 %s %s", lang_edition.work_id, w.id)
logger.debug(u"merge_works path 2 %s %s", lang_edition.work_id, w.id)
merged_work = merge_works(lang_edition.work, w)
models.WorkRelation.objects.get_or_create(
to_work=lang_group[0].work,
@ -498,17 +498,21 @@ def thingisbn(isbn):
Library Thing. (takes isbn_10 or isbn_13, returns isbn_10, except for 979 isbns,
which come back as isbn_13')
"""
logger.info("looking up %s at ThingISBN", isbn)
logger.info(u"looking up %s at ThingISBN", isbn)
url = "https://www.librarything.com/api/thingISBN/%s" % isbn
xml = requests.get(url, headers={"User-Agent": settings.USER_AGENT}).content
doc = ElementTree.fromstring(xml)
return [e.text for e in doc.findall('isbn')]
try:
doc = ElementTree.fromstring(xml)
return [e.text for e in doc.findall('isbn')]
except SyntaxError:
# LibraryThing down
return []
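Catching `SyntaxError` around the XML parse works because `xml.etree.ElementTree.ParseError` is a subclass of `SyntaxError`, so an HTML error page returned during a LibraryThing outage falls through to the empty list instead of raising. A quick check:

from xml.etree import ElementTree
issubclass(ElementTree.ParseError, SyntaxError)   # True, so the except clause above catches bad XML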
def merge_works(w1, w2, user=None):
"""will merge the second work (w2) into the first (w1)
"""
logger.info("merging work %s into %s", w2.id, w1.id)
logger.info(u"merging work %s into %s", w2.id, w1.id)
# don't merge if the works are the same or at least one of the works has no id
#(for example, when w2 has already been deleted)
if w1 is None or w2 is None or w1.id == w2.id or w1.id is None or w2.id is None:
@ -583,7 +587,7 @@ def detach_edition(e):
will detach edition from its work, creating a new stub work. if remerge=true, will see if
there's another work to attach to
"""
logger.info("splitting edition %s from %s", e, e.work)
logger.info(u"splitting edition %s from %s", e, e.work)
w = models.Work(title=e.title, language=e.work.language)
w.save()
@ -618,7 +622,7 @@ def add_openlibrary(work, hard_refresh=False):
work.save()
# find the first ISBN match in OpenLibrary
logger.info("looking up openlibrary data for work %s", work.id)
logger.info(u"looking up openlibrary data for work %s", work.id)
e = None # openlibrary edition json
w = None # openlibrary work json
@ -633,7 +637,7 @@ def add_openlibrary(work, hard_refresh=False):
try:
e = _get_json(url, params, type='ol')
except LookupFailure:
logger.exception("OL lookup failed for %s", isbn_key)
logger.exception(u"OL lookup failed for %s", isbn_key)
e = {}
if e.has_key(isbn_key):
if e[isbn_key].has_key('details'):
@ -673,7 +677,7 @@ def add_openlibrary(work, hard_refresh=False):
)
if e[isbn_key]['details'].has_key('works'):
work_key = e[isbn_key]['details']['works'].pop(0)['key']
logger.info("got openlibrary work %s for isbn %s", work_key, isbn_key)
logger.info(u"got openlibrary work %s for isbn %s", work_key, isbn_key)
models.Identifier.get_or_add(type='olwk', value=work_key, work=work)
try:
w = _get_json("https://openlibrary.org" + work_key, type='ol')
@ -691,14 +695,14 @@ def add_openlibrary(work, hard_refresh=False):
if w.has_key('subjects') and len(w['subjects']) > len(subjects):
subjects = w['subjects']
except LookupFailure:
logger.exception("OL lookup failed for %s", work_key)
logger.exception(u"OL lookup failed for %s", work_key)
if not subjects:
logger.warn("unable to find work %s at openlibrary", work.id)
logger.warn(u"unable to find work %s at openlibrary", work.id)
return
# add the subjects to the Work
for s in subjects:
logger.info("adding subject %s to work %s", s, work.id)
logger.info(u"adding subject %s to work %s", s, work.id)
subject = models.Subject.set_by_name(s, work=work)
work.save()
@ -716,9 +720,9 @@ def _get_json(url, params={}, type='gb'):
if response.status_code == 200:
return json.loads(response.content)
else:
logger.error("unexpected HTTP response: %s", response)
logger.error(u"unexpected HTTP response: %s", response)
if response.content:
logger.error("response content: %s", response.content)
logger.error(u"response content: %s", response.content)
raise LookupFailure("GET failed: url=%s and params=%s" % (url, params))
@ -766,7 +770,7 @@ def load_gutenberg_edition(title, gutenberg_etext_id, ol_work_id, seed_isbn, url
ebook = models.Ebook()
if len(ebooks) > 1:
logger.warning("There is more than one Ebook matching url {0}".format(url))
logger.warning(u"There is more than one Ebook matching url {0}".format(url))
ebook.format = format
@ -826,8 +830,6 @@ def edition_for_etype(etype, metadata, default=None):
for key in metadata.edition_identifiers.keys():
return edition_for_ident(key, metadata.identifiers[key])
MATCH_LICENSE = re.compile(r'creativecommons.org/licenses/([^/]+)/')
def load_ebookfile(url, etype):
'''
return a ContentFile if a new ebook has been loaded
@ -960,8 +962,7 @@ class BasePandataLoader(object):
if contentfile:
contentfile_name = '/loaded/ebook_{}.{}'.format(edition.id, key)
path = default_storage.save(contentfile_name, contentfile)
lic = MATCH_LICENSE.search(metadata.rights_url)
license = 'CC {}'.format(lic.group(1).upper()) if lic else ''
license = cc.license_from_cc_url(metadata.rights_url)
ebf = models.EbookFile.objects.create(
format=key,
edition=edition,


@ -1,8 +1,11 @@
# coding=utf-8
# mostly constants related to Creative Commons
''' mostly constants related to Creative Commons
# let's be DRY with these parameters
## need to add versioned CC entries
'''
import re
INFO_CC = (
('CC BY-NC-ND', 'by-nc-nd', 'Creative Commons Attribution-NonCommercial-NoDerivs 3.0 Unported (CC BY-NC-ND 3.0)', 'https://creativecommons.org/licenses/by-nc-nd/3.0/', 'Creative Commons Attribution-NonCommercial-NoDerivs'),
@ -162,3 +165,15 @@ def match_license(license_string):
except ValueError:
pass
return RIGHTS_ALIAS.get(license_string, None)
MATCH_LICENSE = re.compile(r'creativecommons.org/licenses/([^/]+)/')
def license_from_cc_url(rights_url):
if not rights_url:
return None
lic = MATCH_LICENSE.search(rights_url)
if lic:
return 'CC {}'.format(lic.group(1).upper())
if rights_url.find('openedition.org') >= 0:
return 'OPENEDITION'
return ''
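A rough usage sketch of the new `cc.license_from_cc_url` helper (URLs are illustrative):

from regluit.core import cc

cc.license_from_cc_url('https://creativecommons.org/licenses/by-nc-nd/3.0/')   # 'CC BY-NC-ND'
cc.license_from_cc_url('http://books.openedition.org/some/terms/page')         # 'OPENEDITION'
cc.license_from_cc_url('https://example.org/all-rights-reserved')              # ''
cc.license_from_cc_url(None)                                                    # None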


@ -52,3 +52,9 @@ def add_by_webpage(url, work=None, user=None):
def add_by_sitemap(url, maxnum=None):
return add_from_bookdatas(scrape_sitemap(url, maxnum=maxnum))
def scrape_language(url):
scraper = get_scraper(url)
return scraper.metadata.get('language')


@ -1,42 +1,54 @@
#!/usr/bin/env python
# encoding: utf-8
import logging
import datetime
import json
import logging
import re
from itertools import islice
import requests
from django.db.models import (Q, F)
from django.db.models import Q
from django.core.files.storage import default_storage
from django.core.files.base import ContentFile
from django.core.files.storage import default_storage
import regluit
from oaipmh.client import Client
from oaipmh.error import IdDoesNotExistError
from oaipmh.metadata import MetadataRegistry, oai_dc_reader
from regluit.core import bookloader, cc
from regluit.core import models, tasks
from regluit.core import bookloader
from regluit.core.bookloader import add_by_isbn, merge_works
from regluit.core.bookloader import merge_works
from regluit.core.isbn import ISBN
from regluit.core.loaders.utils import type_for_url
from regluit.core.validation import valid_subject
from . import scrape_language
from .doab_utils import doab_lang_to_iso_639_1, online_to_download, url_to_provider
logger = logging.getLogger(__name__)
springercover = re.compile(r'ftp.+springer\.de.+(\d{13}\.jpg)$', flags=re.U)
def unlist(alist):
if not alist:
return None
return alist[0]
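The new `unlist` helper exists because oai_dc metadata values arrive as lists even when a record carries a single value; a short sketch of its behaviour:

unlist([u'2009'])    # u'2009'
unlist([])           # None
unlist(None)         # None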
SPRINGER_COVER = re.compile(r'ftp.+springer\.de.+(\d{13}\.jpg)$', flags=re.U)
SPRINGER_IMAGE = u'https://images.springer.com/sgw/books/medium/{}.jpg'
def store_doab_cover(doab_id, redo=False):
"""
returns tuple: 1) cover URL, 2) whether newly created (boolean)
"""
cover_file_name= '/doab/%s/cover' % (doab_id)
cover_file_name = '/doab/%s/cover' % (doab_id)
# if we don't want to redo and the cover exists, return the URL of the cover
if not redo and default_storage.exists(cover_file_name):
return (default_storage.url(cover_file_name), False)
# download cover image to cover_file
url = "http://www.doabooks.org/doab?func=cover&rid={0}".format(doab_id)
try:
@ -44,16 +56,16 @@ def store_doab_cover(doab_id, redo=False):
if r.status_code == 302:
redirurl = r.headers['Location']
if redirurl.startswith(u'ftp'):
springerftp = springercover.match(redirurl)
springerftp = SPRINGER_COVER.match(redirurl)
if springerftp:
redirurl = u'https://images.springer.com/sgw/books/medium/{}.jpg'.format(springerftp.group(1))
redirurl = SPRINGER_IMAGE.format(springerftp.group(1))
r = requests.get(redirurl)
else:
r = requests.get(url)
r = requests.get(url)
cover_file = ContentFile(r.content)
cover_file.content_type = r.headers.get('content-type', '')
path = default_storage.save(cover_file_name, cover_file)
default_storage.save(cover_file_name, cover_file)
return (default_storage.url(cover_file_name), True)
except Exception, e:
# if there is a problem, return None for cover URL
@ -74,52 +86,51 @@ def update_cover_doab(doab_id, edition, store_cover=True):
edition.cover_image = cover_url
edition.save()
return cover_url
else:
return None
return None
def attach_more_doab_metadata(edition, description, subjects,
publication_date, publisher_name=None, language=None, authors=u''):
"""
for given edition, attach description, subjects, publication date to
corresponding Edition and Work
"""
# if edition doesn't have a publication date, update it
# if edition doesn't have a publication date, update it
if not edition.publication_date:
edition.publication_date = publication_date
# if edition.publisher_name is empty, set it
if not edition.publisher_name:
edition.set_publisher(publisher_name)
edition.save()
# attach description to work if it's not empty
work = edition.work
if not work.description:
work.description = description
# update subjects
for s in subjects:
if valid_subject(s):
models.Subject.set_by_name(s, work=work)
# set reading level of work if it's empty; doab is for adults.
if not work.age_level:
work.age_level = '18-'
if language:
if language and language != 'xx':
work.language = language
work.save()
if authors and authors == authors: # test for authors != NaN
authlist = creator_list(authors)
if edition.authors.all().count() < len(authlist):
edition.authors.clear()
if authlist is not None:
for [rel,auth] in authlist:
for [rel, auth] in authlist:
edition.add_author(auth, rel)
return edition
def add_all_isbns(isbns, work, language=None, title=None):
@ -128,69 +139,73 @@ def add_all_isbns(isbns, work, language=None, title=None):
first_edition = None
edition = bookloader.add_by_isbn(isbn, work, language=language, title=title)
if edition:
first_edition = first_edition if first_edition else edition
if work and (edition.work_id != work.id):
first_edition = first_edition if first_edition else edition
if work and (edition.work_id != work.id):
if work.created < edition.work.created:
work = merge_works(work, edition.work)
else:
work = merge_works(edition.work, work)
else:
work = edition.work
return first_edition
return first_edition
def load_doab_edition(title, doab_id, url, format, rights,
language, isbns,
provider, **kwargs):
"""
load a record from doabooks.org represented by input parameters and return an ebook
"""
logger.info('load doab {} {} {} {} {}'.format(doab_id, format, rights, language, provider))
if language and isinstance(language, list):
language = language[0]
if language == 'xx' and format == 'online':
language = scrape_language(url)
# check to see whether the Edition hasn't already been loaded first
# search by url
ebooks = models.Ebook.objects.filter(url=url)
# 1 match
# > 1 matches
# 0 match
# simplest case -- if match (1 or more), we could check whether any
# ebook.edition.work has a doab id matching given doab_id
# put a migration to force Ebook.url to be unique id
# if yes, then return one of the Edition(s) whose work is doab_id
# if no, then
# if no, then
ebook = None
if len(ebooks) > 1:
raise Exception("There is more than one Ebook matching url {0}".format(url))
elif len(ebooks) == 1:
raise Exception("There is more than one Ebook matching url {0}".format(url))
elif len(ebooks) == 1:
ebook = ebooks[0]
doab_identifer = models.Identifier.get_or_add(type='doab',value=doab_id,
work=ebook.edition.work)
# update the cover id
doab_identifer = models.Identifier.get_or_add(type='doab', value=doab_id,
work=ebook.edition.work)
# update the cover id
cover_url = update_cover_doab(doab_id, ebook.edition)
# attach more metadata
attach_more_doab_metadata(ebook.edition,
description=kwargs.get('description'),
subjects=kwargs.get('subject'),
publication_date=kwargs.get('date'),
publisher_name=kwargs.get('publisher'),
language=language,
authors=kwargs.get('authors'),)
attach_more_doab_metadata(
ebook.edition,
description=unlist(kwargs.get('description')),
subjects=kwargs.get('subject'),
publication_date=unlist(kwargs.get('date')),
publisher_name=unlist(kwargs.get('publisher')),
language=language,
authors=kwargs.get('creator'),
)
# make sure all isbns are added
add_all_isbns(isbns, None, language=language, title=title)
return ebook
add_all_isbns(isbns, ebook.edition.work, language=language, title=title)
return ebook.edition
# remaining case --> no ebook, load record, create ebook if there is one.
assert len(ebooks) == 0
assert not ebooks
# we need to find the right Edition/Work to tie Ebook to...
# look for the Edition with which to associate ebook.
# loop through the isbns to see whether we get one that is not None
work = None
@ -206,16 +221,16 @@ def load_doab_edition(title, doab_id, url, format, rights,
edition = ident.work.preferred_edition
work = edition.work
break
if edition is not None:
# if this is a new edition, then add related editions asynchronously
if getattr(edition,'new', False):
if getattr(edition, 'new', False):
tasks.populate_edition.delay(edition.isbn_13)
doab_identifer = models.Identifier.get_or_add(type='doab', value=doab_id,
work=edition.work)
work=edition.work)
# we need to create Edition(s) de novo
else:
# we need to create Edition(s) de novo
else:
# if there is a Work with doab_id already, attach any new Edition(s)
try:
work = models.Identifier.objects.get(type='doab', value=doab_id).work
@ -226,11 +241,11 @@ def load_doab_edition(title, doab_id, url, format, rights,
work = models.Work(language='xx', title=title, age_level='18-')
work.save()
doab_identifer = models.Identifier.get_or_add(type='doab', value=doab_id,
work=work)
work=work)
# if work has any ebooks already, attach the ebook to the corresponding edition
# otherwise pick the first one
# pick the first edition as the one to tie ebook to
# pick the first edition as the one to tie ebook to
editions_with_ebooks = models.Edition.objects.filter(Q(work__id=work.id) & \
Q(ebooks__isnull=False)).distinct()
if editions_with_ebooks:
@ -240,73 +255,41 @@ def load_doab_edition(title, doab_id, url, format, rights,
else:
edition = models.Edition(work=work, title=title)
edition.save()
# make the edition the selected_edition of the work
work.selected_edition = edition
work.save()
if format in ('pdf', 'epub', 'mobi'):
if format in ('pdf', 'epub', 'mobi', 'html', 'online'):
ebook = models.Ebook()
ebook.format = format
ebook.provider = provider
ebook.url = url
ebook.url = url
ebook.rights = rights
# tie the edition to ebook
ebook.edition = edition
if format == "online":
ebook.active = False
ebook.save()
# update the cover id (could be done separately)
cover_url = update_cover_doab(doab_id, edition)
# attach more metadata
attach_more_doab_metadata(edition,
description=kwargs.get('description'),
subjects=kwargs.get('subject'),
publication_date=kwargs.get('date'),
publisher_name=kwargs.get('publisher'),
authors=kwargs.get('authors'),)
return ebook
attach_more_doab_metadata(
edition,
description=unlist(kwargs.get('description')),
subjects=kwargs.get('subject'),
publication_date=unlist(kwargs.get('date')),
publisher_name=unlist(kwargs.get('publisher')),
authors=kwargs.get('creator'),
)
return edition
def load_doab_records(fname, limit=None):
success_count = 0
ebook_count = 0
records = json.load(open(fname))
for (i, book) in enumerate(islice(records,limit)):
d = dict(book)
d['isbns'] = split_isbns(d['isbns_raw']) # use stricter isbn string parsing.
try:
ebook = load_doab_edition(**d)
success_count += 1
if ebook:
ebook_count +=1
except Exception, e:
logger.error(e)
logger.error(book)
logger.info("Number of records processed: " + str(success_count))
logger.info("Number of ebooks processed: " + str(ebook_count))
"""
#
#tools to parse the author lists in doab.csv
from pandas import DataFrame
url = "http://www.doabooks.org/doab?func=csv"
df_csv = DataFrame.from_csv(url)
#
out=[]
for val in df_csv.values:
isbn = split_isbns(val[0])
if isbn:
auths = []
if val[2] == val[2] and val[-2] == val[-2]: # test for NaN auths and licenses
auths = creator_list(val[2])
out.append(( isbn[0], auths))
open("/Users/eric/doab_auths.json","w+").write(json.dumps(out,indent=2, separators=(',', ': ')))
"""
au = re.compile(r'\(Authors?\)', flags=re.U)
ed = re.compile(r'\([^\)]*(dir.|[Eeé]ds?.|org.|coord.|Editor|a cura di|archivist)[^\)]*\)', flags=re.U)
tr = re.compile(r'\([^\)]*([Tt]rans.|tr.|translated by)[^\)]*\)', flags=re.U)
@ -326,14 +309,14 @@ def fnf(auth):
if len(parts) == 1:
return parts[0].strip()
elif len(parts) == 2:
return u'{} {}'.format(parts[1].strip(),parts[0].strip())
return u'{} {}'.format(parts[1].strip(), parts[0].strip())
else:
if parts[1].strip() in ('der','van', 'von', 'de', 'ter'):
return u'{} {} {}'.format(parts[2].strip(),parts[1].strip(),parts[0].strip())
if parts[1].strip() in ('der', 'van', 'von', 'de', 'ter'):
return u'{} {} {}'.format(parts[2].strip(), parts[1].strip(), parts[0].strip())
#print auth
#print re.search(namelist,auth).group(0)
return u'{} {}, {}'.format(parts[2].strip(),parts[0].strip(),parts[1].strip())
return u'{} {}, {}'.format(parts[2].strip(), parts[0].strip(), parts[1].strip())
def creator(auth, editor=False):
auth = auth.strip()
@ -349,68 +332,88 @@ def creator(auth, editor=False):
return [u'dsr', fnf(ds.sub(u'', auth))]
if re.search(cm, auth):
return [u'com', fnf(cm.sub(u'', auth))]
auth = au.sub('', auth)
return ['aut', fnf(auth)]
def split_auths(auths):
if ';' in auths or '/' in auths:
return namesep2.split(auths)
else:
nl = namelist.match(auths.strip())
if nl:
if nl.group(3).endswith(' de') \
or ' de ' in nl.group(3) \
or nl.group(3).endswith(' da') \
or nl.group(1).endswith(' Jr.') \
or ' e ' in nl.group(1):
return [auths]
else:
return namesep.split(auths)
else :
return [auths]
def split_isbns(isbns):
result = []
for isbn in isbnsep.split(isbns):
isbn = ISBN(isbn)
if isbn.valid:
result.append(isbn.to_string())
return result
def creator_list(creators):
auths = []
if re.search(edlist, creators):
for auth in split_auths(edlist.sub(u'', creators)):
if auth:
auths.append(creator(auth, editor=True))
else:
for auth in split_auths(unicode(creators)):
if auth:
auths.append(creator(auth))
for auth in creators:
auths.append(creator(auth))
return auths
def load_doab_auths(fname, limit=None):
doab_auths = json.load(open(fname))
recnum = 0
failed = 0
for [isbnraw, authlist] in doab_auths:
isbn = ISBN(isbnraw).to_string()
try:
work = models.Identifier.objects.get(type='isbn',value=isbn).work
except models.Identifier.DoesNotExist:
print 'isbn = {} not found'.format(isbnraw)
failed += 1
if work.preferred_edition.authors.all().count() < len(authlist):
work.preferred_edition.authors.clear()
if authlist is None:
print "null authlist; isbn={}".format(isbn)
DOAB_OAIURL = 'https://www.doabooks.org/oai'
DOAB_PATT = re.compile(r'[\./]doabooks\.org/doab\?.*rid:(\d{1,8}).*')
mdregistry = MetadataRegistry()
mdregistry.registerReader('oai_dc', oai_dc_reader)
doab_client = Client(DOAB_OAIURL, mdregistry)
def add_by_doab(doab_id, record=None):
try:
record = record if record else doab_client.getRecord(
metadataPrefix='oai_dc',
identifier='oai:doab-books:{}'.format(doab_id)
)
metadata = record[1].getMap()
isbns = []
url = None
for ident in metadata.pop('identifier', []):
if ident.startswith('ISBN: '):
isbn = ISBN(ident[6:])
if isbn.error:
continue
isbn.validate()
isbns.append(isbn.to_string())
elif ident.find('doabooks.org') >= 0:
# should already know the doab_id
continue
for [rel,auth] in authlist:
work.preferred_edition.add_author(auth, rel)
recnum +=1
if limit and recnum > limit:
break
logger.info("Number of records processed: " + str(recnum))
logger.info("Number of missing isbns: " + str(failed))
else:
url = ident
language = doab_lang_to_iso_639_1(unlist(metadata.pop('language', None)))
urls = online_to_download(url)
edition = None
for dl_url in urls:
format = type_for_url(dl_url)
if 'format' in metadata:
del metadata['format']
edition = load_doab_edition(
unlist(metadata.pop('title', None)),
doab_id,
dl_url,
format,
cc.license_from_cc_url(unlist(metadata.pop('rights', None))),
language,
isbns,
url_to_provider(dl_url) if dl_url else None,
**metadata
)
return edition
except IdDoesNotExistError:
return None
def getdoab(url):
id_match = DOAB_PATT.search(url)
if id_match:
return id_match.group(1)
return False
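An illustrative call against `DOAB_PATT` (the record id is invented):

getdoab('http://www.doabooks.org/doab?func=search&query=rid:15087')   # '15087'
getdoab('http://example.org/not-a-doab-url')                           # False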
def load_doab_oai(from_year=2000, limit=100000):
'''
use oai feed to get oai updates
'''
from_ = datetime.datetime(year=from_year, month=1, day=1)
doab_ids = []
for record in doab_client.listRecords(metadataPrefix='oai_dc', from_=from_):
if not record[1]:
continue
idents = record[1].getMap()['identifier']
if idents:
for ident in idents:
doab = getdoab(ident)
if doab:
doab_ids.append(doab)
e = add_by_doab(doab, record=record)
logger.info(u'updated:\t{}\t{}'.format(doab, e.title))
if len(doab_ids) > limit:
break

core/loaders/doab_utils.py (new file, 126 lines)

@ -0,0 +1,126 @@
"""
doab_utils.py
"""
import re
import urlparse
import requests
from regluit.utils.lang import get_language_code
from .utils import get_soup
# utility functions for converting lists of individual items into individual items
# let's do a mapping of the DOAB languages into the language codes used
# mostly, we just handle mispellings
# also null -> xx
EXTRA_LANG_MAP = dict([
(u'chinese', 'zh'),
(u'deutsch', 'de'),
(u'eng', 'en'),
(u'englilsh', 'en'),
(u'englisch', 'en'),
(u'espanol', 'es'),
(u'ger', 'de'),
(u'fra', 'fr'),
(u'fre', 'fr'),
(u'francese', 'fr'),
(u'ita', 'it'),
(u'italiano', 'it'),
(u'norwegian', 'no'),
(u'por', 'pt'),
(u'portugese', 'pt'),
(u'slovene', 'sl'),
(u'spa', 'es'),
(u'spagnolo', 'es'),
])
sep = re.compile(r'[ \-;^,/]+')
def doab_lang_to_iso_639_1(lang):
if lang is None or not lang:
return "xx"
else:
lang = sep.split(lang)[0]
code = get_language_code(lang)
if code:
return code
else:
return EXTRA_LANG_MAP.get(lang.lower(), 'xx')
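Hedged examples of the language normalization, using the kinds of raw values DOAB records tend to carry:

from regluit.core.loaders.doab_utils import doab_lang_to_iso_639_1

doab_lang_to_iso_639_1(None)               # 'xx'
doab_lang_to_iso_639_1(u'English')         # 'en'  (resolved by get_language_code)
doab_lang_to_iso_639_1(u'fre')             # 'fr'  (resolved by EXTRA_LANG_MAP)
doab_lang_to_iso_639_1(u'English; Dutch')  # 'en'  (only the first token before a separator is used)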
DOMAIN_TO_PROVIDER = dict([
[u'www.doabooks.org', u'Directory of Open Access Books'],
[u'www.oapen.org', u'OAPEN Library'],
[u'books.openedition.org', u'OpenEdition Books'],
[u'digitalcommons.usu.edu', u'DigitalCommons, Utah State University'],
[u'www.aupress.ca', u'Athabasca University Press'],
[u'dspace.ucalgary.ca', u'Institutional Repository at the University of Calgary'],
[u'www.degruyter.com', u'De Gruyter Online'],
[u'dx.doi.org', u'DOI Resolver'],
[u'www.openbookpublishers.com', u'Open Book Publishers'],
[u'www.adelaide.edu.au', u'University of Adelaide'],
[u'hdl.handle.net', u'Handle Proxy'],
[u'link.springer.com', u'Springer'],
[u'www.bloomsburyacademic.com', u'Bloomsbury Academic'],
[u'www.ledizioni.it', u'Ledizioni'],
[u'ccdigitalpress.org', u'Computers and Composition Digital Press'],
[u'leo.cilea.it', u'LEO '],
[u'www.springerlink.com', u'Springer'],
[u'www.palgraveconnect.com', u'Palgrave Connect'],
[u'www.ubiquitypress.com', u'Ubiquity Press'],
[u'ebooks.iospress.nl', u'IOS Press Ebooks'],
[u'antropologie.zcu.cz', u'AntropoWeb'],
[u'www.unito.it', u"University of Turin"],
[u'leo.cineca.it', u'Letteratura Elettronica Online'],
[u'hw.oeaw.ac.at', u'Austrian Academy of Sciences'],
[u'www.co-action.net', u'Co-Action Publishing'],
[u'www.aliprandi.org', u'Simone Aliprandi'],
[u'www.maestrantonella.it', u'maestrantonella.it'],
[u'www.antilia.to.it', u'antilia.to.it'],
[u'www.scribd.com', u'Scribd'],
[u'ledibooks.com', u'LediBooks'],
[u'press.openedition.org', u'OpenEdition Press'],
[u'oapen.org', u'OAPEN Library'],
[u'www.ebooks.iospress.nl', u'IOS Press Ebooks'],
[u'windsor.scholarsportal.info', u'Scholars Portal'],
[u'www.unimib.it', u'University of Milano-Bicocca'],
[u'books.mdpi.com', u'MDPI Books'],
[u'www.dropbox.com', u'Dropbox'],
[u'dl.dropboxusercontent.com', u'Dropbox'],
])
def url_to_provider(url):
netloc = urlparse.urlparse(url).netloc
return DOMAIN_TO_PROVIDER.get(netloc, netloc)
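Provider resolution falls back to the bare hostname when a domain is not in the table; a quick sketch (URLs illustrative):

url_to_provider('http://www.oapen.org/search?identifier=12345')   # u'OAPEN Library'
url_to_provider('http://repository.example.edu/thesis.pdf')       # 'repository.example.edu'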
FRONTIERSIN = re.compile(r'frontiersin.org/books/[^/]+/(\d+)')
def online_to_download(url):
urls = []
if url.find(u'mdpi.com/books/pdfview/book/') >= 0:
doc = get_soup(url)
if doc:
obj = doc.find('object', type='application/pdf')
if obj:
urls.append(obj['data'].split('#')[0])
elif url.find(u'books.scielo.org/') >= 0:
doc = get_soup(url)
if doc:
obj = doc.find('a', class_='pdf_file')
if obj:
urls.append(urlparse.urljoin(url, obj['href']))
obj = doc.find('a', class_='epub_file')
if obj:
urls.append(urlparse.urljoin(url, obj['href']))
elif FRONTIERSIN.search(url):
booknum = FRONTIERSIN.search(url).group(1)
urls.append(u'https://www.frontiersin.org/GetFile.aspx?ebook={}&fileformat=EPUB'.format(booknum))
urls.append(u'https://www.frontiersin.org/GetFile.aspx?ebook={}&fileformat=PDF'.format(booknum))
else:
urls.append(url)
return urls
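`online_to_download` turns a landing-page URL into direct download URLs where it knows how: the MDPI and SciELO branches scrape the page, while the Frontiers branch rewrites the URL directly. A hedged example of the Frontiers case (book number invented):

online_to_download(u'https://www.frontiersin.org/books/Some_Title/789')
# [u'https://www.frontiersin.org/GetFile.aspx?ebook=789&fileformat=EPUB',
#  u'https://www.frontiersin.org/GetFile.aspx?ebook=789&fileformat=PDF']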

core/loaders/tests.py (new file, 28 lines)

@ -0,0 +1,28 @@
from django.conf import settings
from django.test import TestCase
from regluit.core.models import Ebook, Edition, Work
from .utils import dl_online
class LoaderTests(TestCase):
def setUp(self):
pass
def test_downloads(self):
if not (settings.TEST_INTEGRATION):
return
work = Work(title="online work")
work.save()
edition = Edition(work=work)
edition.save()
dropbox_url = 'https://www.dropbox.com/s/h5jzpb4vknk8n7w/Jakobsson_The_Troll_Inside_You_EBook.pdf?dl=0'
dropbox_ebook = Ebook.objects.create(format='online', url=dropbox_url, edition=edition)
dropbox_ebf = dl_online(dropbox_ebook)
self.assertTrue(dropbox_ebf.ebook.filesize)
jbe_url = 'http://www.jbe-platform.com/content/books/9789027295958'
jbe_ebook = Ebook.objects.create(format='online', url=jbe_url, edition=edition)
jbe_ebf = dl_online(jbe_ebook)
self.assertTrue(jbe_ebf.ebook.filesize)


@ -1,15 +1,24 @@
import csv
import re
import requests
import logging
import re
import sys
import time
import unicodedata
import urlparse
from bs4 import BeautifulSoup
import requests
from django.conf import settings
from django.core.files.base import ContentFile
from regluit.core.models import Work, Edition, Author, PublisherName, Identifier, Subject
from regluit.core.isbn import ISBN
from regluit.core.bookloader import add_by_isbn_from_google, merge_works
from regluit.api.crosswalks import inv_relator_contrib
from regluit.bisac.models import BisacHeading
from regluit.core.bookloader import add_by_isbn_from_google, merge_works
from regluit.core.isbn import ISBN
from regluit.core.models import (
Author, Ebook, EbookFile, Edition, Identifier, path_for_file, PublisherName, Subject, Work,
)
logger = logging.getLogger(__name__)
@ -34,6 +43,12 @@ def utf8_general_ci_norm(s):
s1 = unicodedata.normalize('NFD', s)
return ''.join(c for c in s1 if not unicodedata.combining(c)).upper()
def get_soup(url):
response = requests.get(url, headers={"User-Agent": settings.USER_AGENT})
if response.status_code == 200:
return BeautifulSoup(response.content, 'lxml')
return None
def get_authors(book):
authors=[]
if book.get('AuthorsList',''):
@ -331,14 +346,15 @@ def loaded_book_ok(book, work, edition):
return True
ID_URLPATTERNS = {
'goog': re.compile(r'[\./]google\.com/books\?.*id=([a-zA-Z0-9\-_]{12})'),
'olwk': re.compile(r'[\./]openlibrary\.org(/works/OL\d{1,8}W)'),
'gdrd': re.compile(r'[\./]goodreads\.com/book/show/(\d{1,8})'),
'ltwk': re.compile(r'[\./]librarything\.com/work/(\d{1,8})'),
'oclc': re.compile(r'\.worldcat\.org/.*oclc/(\d{8,12})'),
'doi': re.compile(r'[\./]doi\.org/(10\.\d+/\S+)'),
'gtbg': re.compile(r'[\./]gutenberg\.org/ebooks/(\d{1,6})'),
'glue': re.compile(r'[\./]unglue\.it/work/(\d{1,7})'),
'goog': re.compile(r'[\./]google\.com/books\?.*id=(?P<id>[a-zA-Z0-9\-_]{12})'),
'olwk': re.compile(r'[\./]openlibrary\.org(?P<id>/works/OL\d{1,8}W)'),
'doab': re.compile(r'([\./]doabooks\.org/doab\?.*rid:|=oai:doab-books:)(?P<id>\d{1,8})'),
'gdrd': re.compile(r'[\./]goodreads\.com/book/show/(?P<id>\d{1,8})'),
'ltwk': re.compile(r'[\./]librarything\.com/work/(?P<id>\d{1,8})'),
'oclc': re.compile(r'\.worldcat\.org/.*oclc/(?P<id>\d{8,12})'),
'doi': re.compile(r'[\./]doi\.org/(?P<id>10\.\d+/\S+)'),
'gtbg': re.compile(r'[\./]gutenberg\.org/ebooks/(?P<id>\d{1,6})'),
'glue': re.compile(r'[\./]unglue\.it/work/(?P<id>\d{1,7})'),
}
def ids_from_urls(url):
@ -346,7 +362,111 @@ def ids_from_urls(url):
for ident in ID_URLPATTERNS.keys():
id_match = ID_URLPATTERNS[ident].search(url)
if id_match:
ids[ident] = id_match.group(1)
ids[ident] = id_match.group('id')
return ids
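A sketch of the named-group extraction, including the new 'doab' pattern (identifiers invented):

from regluit.core.loaders.utils import ids_from_urls

ids_from_urls('https://www.doabooks.org/doab?func=search&query=rid:15087')   # {'doab': '15087'}
ids_from_urls('https://doi.org/10.1234/example')                              # {'doi': '10.1234/example'}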
DROPBOX_DL = re.compile(r'"(https://dl.dropboxusercontent.com/content_link/[^"]+)"')
def dl_online(ebook):
if ebook.format != 'online':
return
if ebook.url.find(u'dropbox.com/s/') >= 0:
response = requests.get(ebook.url, headers={"User-Agent": settings.USER_AGENT})
if response.status_code == 200:
match_dl = DROPBOX_DL.search(response.content)
if match_dl:
return make_dl_ebook(match_dl.group(1), ebook)
elif ebook.url.find(u'jbe-platform.com/content/books/') >= 0:
doc = get_soup(ebook.url)
if doc:
obj = doc.select_one('div.fulltexticoncontainer-PDF a')
if obj:
dl_url = urlparse.urljoin(ebook.url, obj['href'])
return make_dl_ebook(dl_url, ebook)
def make_dl_ebook(url, ebook):
if EbookFile.objects.filter(source=ebook.url):
return EbookFile.objects.filter(source=ebook.url)[0]
response = requests.get(url, headers={"User-Agent": settings.USER_AGENT})
if response.status_code == 200:
filesize = int(response.headers.get("Content-Length", 0))
filesize = filesize if filesize else None
format = type_for_url(url, content_type=response.headers.get('content-type'))
if format != 'online':
new_ebf = EbookFile.objects.create(
edition=ebook.edition,
format=format,
source=ebook.url,
)
new_ebf.file.save(path_for_file(new_ebf, None), ContentFile(response.content))
new_ebf.save()
new_ebook = Ebook.objects.create(
edition=ebook.edition,
format=format,
provider='Unglue.it',
url=new_ebf.file.url,
rights=ebook.rights,
filesize=filesize,
version_label=ebook.version_label,
version_iter=ebook.version_iter,
)
new_ebf.ebook = new_ebook
new_ebf.save()
return new_ebf
def type_for_url(url, content_type=None):
if not url:
return ''
if url.find('books.openedition.org') >= 0:
return ('online')
ct = content_type if content_type else contenttyper.calc_type(url)
if re.search("pdf", ct):
return "pdf"
elif re.search("octet-stream", ct) and re.search("pdf", url, flags=re.I):
return "pdf"
elif re.search("octet-stream", ct) and re.search("epub", url, flags=re.I):
return "epub"
elif re.search("text/plain", ct):
return "text"
elif re.search("text/html", ct):
if url.find('oapen.org/view') >= 0:
return "html"
return "online"
elif re.search("epub", ct):
return "epub"
elif re.search("mobi", ct):
return "mobi"
return "other"
class ContentTyper(object):
""" """
def __init__(self):
self.last_call = dict()
def content_type(self, url):
try:
r = requests.head(url)
return r.headers.get('content-type')
except:
return None
def calc_type(self, url):
delay = 1
# is there a delay associated with the url
netloc = urlparse.urlparse(url).netloc
# wait if necessary
last_call = self.last_call.get(netloc)
if last_call is not None:
now = time.time()
min_time_next_call = last_call + delay
if min_time_next_call > now:
time.sleep(min_time_next_call-now)
self.last_call[netloc] = time.time()
# compute the content-type
return self.content_type(url)
contenttyper = ContentTyper()
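The module-level `contenttyper` throttles HEAD requests per host so repeated lookups don't hammer a single provider; roughly (host invented):

contenttyper.calc_type('http://example.org/a.pdf')   # HEAD request goes out immediately
contenttyper.calc_type('http://example.org/b.pdf')   # sleeps until ~1 second after the previous call to example.org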


@ -1,17 +0,0 @@
import os
from django.conf import settings
from django.contrib.auth.models import User
from django.core.management.base import BaseCommand
from regluit.core.loaders import doab
class Command(BaseCommand):
help = "load doab books"
args = "<limit> <file_name>"
def handle(self, limit=None, file_name="../../../bookdata/doab.json", **options):
command_dir = os.path.dirname(os.path.realpath(__file__))
file_path = os.path.join(command_dir, file_name)
doab.load_doab_records(file_path, limit=int(limit))


@ -0,0 +1,21 @@
from django.core.management.base import BaseCommand
from regluit.core.loaders.utils import dl_online
from regluit.core.models import Ebook
class Command(BaseCommand):
help = "harvest downloadable ebooks from 'online' ebooks"
args = "<limit>"
def handle(self, limit=0, **options):
limit = int(limit) if limit else 0
onlines = Ebook.objects.filter(format='online')
done = 0
for online in onlines:
new_ebf = dl_online(online)
if new_ebf:
done += 1
if done > limit:
break
print 'harvested {} ebooks'.format(done)


@ -0,0 +1,10 @@
from django.core.management.base import BaseCommand
from regluit.core.loaders import doab
class Command(BaseCommand):
help = "load doab books by doab_id via oai"
args = "<doab_id>"
def handle(self, doab_id, **options):
doab.add_by_doab(doab_id)


@ -0,0 +1,18 @@
from django.core.management.base import BaseCommand
from regluit.core.loaders import doab
class Command(BaseCommand):
help = "load doab books via oai"
args = "<from_year> <limit>"
def handle(self, from_year= None, limit=None, **options):
from_year = int(from_year) if from_year else None
limit = int(limit) if limit else None
if limit:
doab.load_doab_oai(from_year=from_year, limit=limit)
else:
if from_year:
doab.load_doab_oai(from_year=from_year)
else:
doab.load_doab_oai()


@ -1083,7 +1083,7 @@ class EbookFile(models.Model):
source=self.file.url
)
new_mobi_ebf.file.save(path_for_file('ebf', None), mobi_cf)
new_mobi_ebf.file.save(path_for_file(new_mobi_ebf, None), mobi_cf)
new_mobi_ebf.save()
if self.ebook:
new_ebook = Ebook.objects.create(

File diff suppressed because it is too large.


@ -21,6 +21,7 @@ from regluit.core.bookloader import (
from regluit.core.parameters import WORK_IDENTIFIERS
from regluit.core.loaders import add_by_webpage
from regluit.core.loaders.doab import add_by_doab
from regluit.core.loaders.utils import ids_from_urls
from regluit.frontend.forms import EditionForm, IdentifierForm
@ -106,6 +107,11 @@ def get_edition_for_id(id_type, id_value, user=None):
if edition:
return user_edition(edition, user)
if identifiers.has_key('doab'):
edition = add_by_doab(identifiers['doab'])
if edition:
return user_edition(edition, user)
if identifiers.has_key('oclc'):
edition = add_by_oclc(identifiers['oclc'])
if edition:


@ -45,7 +45,7 @@ html5lib==1.0b3
httplib2==0.7.5
isodate==0.5.1
kombu==3.0.35
lxml==2.3.5
lxml==4.2.1
defusedxml==0.4.1
mechanize==0.2.5
mimeparse==0.1.3
@ -58,6 +58,7 @@ paramiko==1.14.1
postmonkey==1.0b
pycrypto==2.6
pymarc==3.0.2
pyoai==2.5.0
pyparsing==2.0.3
python-dateutil==2.5.3
python-mimeparse==0.1.4
@ -72,7 +73,7 @@ requests==2.10.0
requests-mock==1.2.0
requests-oauthlib==0.6.2
selenium==2.53.1
six==1.9.0
six==1.11.0
sorl-thumbnail==12.3
ssh==1.7.14
stevedore==1.12.0


@ -1,6 +1,10 @@
from django.conf.global_settings import LANGUAGES
lang2code = dict([ (lang[1].lower(), lang[0]) for lang in LANGUAGES ])
code2lang = dict(LANGUAGES)
def get_language_code(language):
return lang2code.get(language.lower().strip(), '')
language = language.lower().strip()
if language in code2lang:
return language
return lang2code.get(language, '')
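The change lets `get_language_code` accept either a display name or an ISO code; hedged examples:

from regluit.utils.lang import get_language_code

get_language_code('English')   # 'en'
get_language_code('en')        # 'en'  (new: codes now pass through)
get_language_code('Klingon')   # ''    (unknown values still map to the empty string)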