regluit/core/validation.py

# encoding: utf-8
'''
methods to validate and clean identifiers
'''
import re
import datetime

from dateutil.parser import parse
from PyPDF2 import PdfFileReader

from django.forms import ValidationError
from django.utils.translation import ugettext_lazy as _

from regluit.pyepub import EPUB
from regluit.mobi import Mobi
from .isbn import ISBN

# Maps an identifier type to a (pattern, error message) pair.  The pattern is
# either a compiled regex or a pattern string; identifier_cleaner compiles the
# strings and applies the pattern with `match`, so matching is always anchored
# at the start of the value.  Every type except 'http' also accepts the literal
# value 'delete' (presumably a request to remove the identifier -- confirm
# against callers).
ID_VALIDATION = {
    'http': (re.compile(r"(https?|ftp)://(-\.)?([^\s/?\.#]+\.?)+(/[^\s]*)?$",
                        flags=re.IGNORECASE|re.S),
             "The Web Address must be a valid http(s) URL."),
    'isbn':  (r'^([\dxX\-–— ]+|delete)$',
              "The ISBN must be a valid ISBN-13."),
    'doab': (r'^(\d{1,6}|delete)$',
             "The value must be 1-6 digits."),
    'gtbg': (r'^(\d{1,6}|delete)$',
             "The Gutenberg number must be 1-6 digits."),
    'doi': (r'^(https?://dx\.doi\.org/|https?://doi\.org/)?(10\.\d+/\S+|delete)$',
            "The DOI value must be a valid DOI."),
    # NOTE: the pattern caps the length at 12 digits although the message
    # only says "8 or more".
    'oclc': (r'^(\d{8,12}|delete)$',
             "The OCLCnum must be 8 or more digits."),
    'goog': (r'^([a-zA-Z0-9\-_]{12}|delete)$',
             "The Google id must be 12 alphanumeric characters, dash or underscore."),
    'gdrd': (r'^(\d{1,8}|delete)$',
             "The Goodreads ID must be 1-8 digits."),
    'thng': (r'^(\d{1,8}|delete)$',
             "The LibraryThing ID must be 1-8 digits."),
    # Optional '/works/' prefix, then 'OL', 1-8 digits and a trailing 'W'.
    'olwk': (r'^((/works/)?OL\d{1,8}W|delete)$',
             "The Open Library Work ID looks like 'OL####W'."),
    'glue': (r'^(\d{1,6}|delete)$',
             "The Unglue.it ID must be 1-6 digits."),
    'ltwk': (r'^(\d{1,8}|delete)$',
             "The LibraryThing work ID must be 1-8 digits."),
}

def isbn_cleaner(value):
    ''' normalize an ISBN string.

        Returns 'delete' unchanged; otherwise returns ISBN(value).to_string().
        Raises ValidationError when value is empty or when the ISBN object
        reports an error.
    '''
    if value == 'delete':
        return value
    if not value:
        raise ValidationError('no identifier value found')
    isbn = ISBN(value)
    if isbn.error:
        raise ValidationError(isbn.error)
    # The result of validate() is not inspected here; presumably it updates
    # state on the ISBN object before to_string() -- confirm in .isbn.
    isbn.validate()
    return isbn.to_string()

def olwk_cleaner(value):
    ''' normalize an Open Library work id to the '/works/OL####W' form.

        'delete' is returned unchanged; a value that already carries the
        '/works/' prefix is returned as is.
    '''
    if value != 'delete' and not value.startswith('/works/'):
        value = '/works/{}'.format(value)
    return value

doi_match = re.compile(r'10\.\d+/\S+')

def doi_cleaner(value):
    ''' reduce a DOI given as a URL (or with other leading text) to the bare DOI.

        'delete' and values already starting with '10.' are returned unchanged.
        Otherwise the first substring matching doi_match is returned, or ''
        when there is none.
    '''
    if value == 'delete' or value.startswith('10.'):
        return value
    found = doi_match.search(value)
    if found is None:
        return ''
    return found.group(0)

# Optional per-type cleaner that identifier_cleaner applies to a value after
# it has matched the ID_VALIDATION pattern for that type.
ID_MORE_VALIDATION = dict(
    isbn=isbn_cleaner,
    olwk=olwk_cleaner,
    doi=doi_cleaner,
)

def identifier_cleaner(id_type, quiet=False):
    ''' build a cleaner function for identifiers of type id_type.

        id_type: a key of ID_VALIDATION; for any other value the returned
            function hands its argument back unchanged.
        quiet: when true, the cleaner returns None for a rejected value
            instead of raising ValidationError.

        The returned cleaner(value) returns None for an empty value, otherwise
        the value (passed through the matching ID_MORE_VALIDATION function, if
        there is one) when it matches the pattern for id_type.
    '''
    if id_type not in ID_VALIDATION:
        return lambda value: value
    (regex, err_msg) = ID_VALIDATION[id_type]
    extra = ID_MORE_VALIDATION.get(id_type)
    # Table entries are either pattern strings or already-compiled patterns;
    # only the former lack a `match` method.
    if not hasattr(regex, 'match'):
        regex = re.compile(regex)

    def cleaner(value):
        if not value:
            return None
        try:
            if not regex.match(value):
                raise ValidationError(err_msg)
            # the extra cleaner may itself raise ValidationError
            return extra(value) if extra else value
        except ValidationError:
            if quiet:
                return None
            raise
    return cleaner

def test_file(the_file, fformat):
    ''' check that an uploaded file can be opened as the format it claims to be.

        the_file: object with `name` and `file` attributes (presumably a
            Django uploaded file -- confirm against callers); a false value
            or an empty name skips the check.
        fformat: 'epub', 'mobi' or 'pdf'; any other value skips the check.

        Returns True when the check passes or is skipped.
        Raises ValidationError when the matching reader fails to open the file.
    '''
    if the_file and the_file.name:
        # NOTE(review): the messages below are interpolated before being
        # passed to _(), so the lookup key includes the interpolated text.
        if fformat == 'epub':
            try:
                EPUB(the_file.file)
            except Exception as e:
                raise ValidationError(_('Are you sure this is an EPUB file?: %s' % e))
        elif fformat == 'mobi':
            try:
                book = Mobi(the_file.file)
                book.parse()
            except Exception as e:
                raise ValidationError(_('Are you sure this is a MOBI file?: %s' % e))
        elif fformat == 'pdf':
            try:
                PdfFileReader(the_file.file)
            except Exception:
                raise ValidationError(_('%s is not a valid PDF file' % the_file.name))
    return True

def valid_xml_char_ordinal(c):
    ''' return True when the code point of character c is tab, newline,
        carriage return, or lies in one of the ranges 0x20-0xD7FF,
        0xE000-0xFFFD or 0x10000-0x10FFFF; False otherwise.
    '''
    cp = ord(c)
    # tests are ordered by presumed frequency
    if 0x20 <= cp <= 0xD7FF:
        return True
    if cp in (0x9, 0xA, 0xD):
        return True
    if 0xE000 <= cp <= 0xFFFD:
        return True
    return 0x10000 <= cp <= 0x10FFFF

def valid_subject(subject_name):
    ''' return False when subject_name contains a character rejected by
        valid_xml_char_ordinal or more than two commas; True otherwise.
    '''
    commas_left = 2
    for ch in subject_name:
        if not valid_xml_char_ordinal(ch):
            return False
        if ch != ',':
            continue
        if commas_left == 0:
            # this is the third comma
            return False
        commas_left -= 1
    return True

# A comma that is not introducing a "Jr" suffix.  The suffix may be followed by
# '.', ',', a space, or the end of the string; the end-of-string case matters
# because unreverse_name strips the final period before searching.
reverse_name_comma = re.compile(r',(?! *Jr(?:[\., ]|$))')

def unreverse_name(name):
    ''' turn a "Last, First" name into "First Last".

        Leading and trailing periods are stripped first.  A name without a
        reversing comma (a ", Jr" suffix does not count) is returned as is.
        "Last, First, Rest" becomes "First Last, Rest".
    '''
    name = name.strip('.')
    if not reverse_name_comma.search(name):
        return name
    (last, rest) = name.split(',', 1)
    if ',' not in rest:
        return '%s %s' % (rest.strip(), last.strip())
    (first, rest) = rest.split(',', 1)
    return '%s %s, %s' % (first.strip(), last.strip(), rest.strip())

try:
    _AUTH_STRING_TYPES = (basestring,)  # Python 2: covers str and unicode
except NameError:
    _AUTH_STRING_TYPES = (str,)  # Python 3

def authlist_cleaner(authlist):
    ''' given a author string or list of author strings, checks that the author string
        is not a list of author names and that no author is repeated

        Returns a list of cleaned author names in first-seen order.  A single
        string (byte or unicode) is treated as a one-element list rather than
        being iterated character by character.
    '''
    if isinstance(authlist, _AUTH_STRING_TYPES):
        authlist = [authlist]
    cleaned = []
    for auth in authlist:
        for cleaned_auth in auth_cleaner(auth):
            if cleaned_auth not in cleaned:
                cleaned.append(cleaned_auth)
    return cleaned

# Patterns used to split a string holding several author names.
# comma_list_delim: a comma, unless it is followed by "Jr" and then '.', ',' or a space
comma_list_delim = re.compile(r',(?! *Jr[\., ])')
spaces = re.compile(r'\s+')
_and_ = re.compile(r',? (and|\&) ')
semicolon_list_delim = re.compile(r'[\;|\&]')
reversed_name = re.compile(r'(de |la |los |von |van )*\w+, \w+.?( \w+.?)?(, Jr\.?)?')

def auth_cleaner(auth):
    ''' split an author string that may hold several names.

        When the string contains ';' or starts like a reversed
        ("Last, First") name, it is split with semicolon_list_delim and each
        piece goes through unreverse_name.  Otherwise " and " / " & " are
        replaced by commas and the string is split with comma_list_delim.
        Each resulting name is stripped and its whitespace runs collapsed to
        a single space.  Returns the list of names.
    '''
    if ';' in auth or reversed_name.match(auth):
        names = [unreverse_name(piece) for piece in semicolon_list_delim.split(auth)]
    else:
        names = comma_list_delim.split(_and_.sub(',', auth))
    return [spaces.sub(' ', name.strip()) for name in names]

# a four-digit number starting with 1 or 2
MATCHYEAR = re.compile(r'(1|2)\d\d\d')
# the same, followed by -dd-dd
MATCHYMD = re.compile(r'(1|2)\d\d\d-\d\d-\d\d')

def validate_date(date_string):
    ''' extract a date from free text.

        Returns the first 'YYYY-MM-DD' substring when there is one; otherwise
        the four-digit year found by dateutil's parser; otherwise the first
        four-digit number starting with 1 or 2; otherwise ''.
    '''
    ymd = MATCHYMD.search(date_string)
    if ymd:
        return ymd.group(0)
    try:
        # Year 999 is a sentinel default: if it survives parsing, the text
        # held no year.  A datetime (not a date) default is used so that text
        # containing a time of day can be merged into it.
        date = parse(date_string.strip(), default=datetime.datetime(999, 1, 1))
        if date.year != 999:
            # kept inside the try: on Python 2 strftime raises ValueError for
            # years before 1900, and the regex fallback below then applies
            return date.strftime('%Y')
    except (ValueError, OverflowError):
        pass
    year = MATCHYEAR.search(date_string)
    if year:
        return year.group(0)
    return ''
scrapes the metadata also moves id validation to core 2017-08-03 20:15:06 +00:00			`# encoding: utf-8`
			`'''`
			`methods to validate and clean identifiers`
			`'''`
			`import re`
added date validation 2018-01-03 18:30:36 +00:00			`import datetime`
'format' is a built-in test_file was not doing anything because format parameter not passed. Didn't raise error because 'format' is a built-in function 2017-08-07 20:13:22 +00:00
lint 2018-01-03 18:43:02 +00:00			`from dateutil.parser import parse`
'format' is a built-in test_file was not doing anything because format parameter not passed. Didn't raise error because 'format' is a built-in function 2017-08-07 20:13:22 +00:00			`from PyPDF2 import PdfFileReader`

scrapes the metadata also moves id validation to core 2017-08-03 20:15:06 +00:00			`from django.forms import ValidationError`
lint 2018-01-03 18:43:02 +00:00			`from django.utils.translation import ugettext_lazy as _`

'format' is a built-in test_file was not doing anything because format parameter not passed. Didn't raise error because 'format' is a built-in function 2017-08-07 20:13:22 +00:00			`from regluit.pyepub import EPUB`
			`from regluit.mobi import Mobi`
scrapes the metadata also moves id validation to core 2017-08-03 20:15:06 +00:00			`from .isbn import ISBN`

			`ID_VALIDATION = {`
			`'http': (re.compile(r"(https?\|ftp)://(-\.)?([^\s/?\.#]+\.?)+(/[^\s]*)?$",`
lint 2018-01-03 18:43:02 +00:00			`flags=re.IGNORECASE\|re.S),`
			`"The Web Address must be a valid http(s) URL."),`
			`'isbn': (r'^([\dxX\-–— ]+\|delete)$',`
			`"The ISBN must be a valid ISBN-13."),`
			`'doab': (r'^(\d{1,6}\|delete)$',`
			`"The value must be 1-6 digits."),`
scrapes the metadata also moves id validation to core 2017-08-03 20:15:06 +00:00			`'gtbg': (r'^(\d{1,6}\|delete)$',`
lint 2018-01-03 18:43:02 +00:00			`"The Gutenberg number must be 1-6 digits."),`
			`'doi': (r'^(https?://dx\.doi\.org/\|https?://doi\.org/)?(10\.\d+/\S+\|delete)$',`
			`"The DOI value must be a valid DOI."),`
			`'oclc': (r'^(\d{8,12}\|delete)$',`
			`"The OCLCnum must be 8 or more digits."),`
			`'goog': (r'^([a-zA-Z0-9\-_]{12}\|delete)$',`
			`"The Google id must be 12 alphanumeric characters, dash or underscore."),`
			`'gdrd': (r'^(\d{1,8}\|delete)$',`
			`"The Goodreads ID must be 1-8 digits."),`
			`'thng': (r'(^\d{1,8}\|delete)$',`
			`"The LibraryThing ID must be 1-8 digits."),`
			`'olwk': (r'^(/works/\)?OLd{1,8}W\|delete)$',`
			`"The Open Library Work ID looks like 'OL####W'."),`
			`'glue': (r'^(\d{1,6}\|delete)$',`
			`"The Unglue.it ID must be 1-6 digits."),`
			`'ltwk': (r'^(\d{1,8}\|delete)$',`
			`"The LibraryThing work ID must be 1-8 digits."),`
scrapes the metadata also moves id validation to core 2017-08-03 20:15:06 +00:00			`}`

			`def isbn_cleaner(value):`
			`if value == 'delete':`
			`return value`
			`if not value:`
bad variable reference 2017-09-04 20:10:55 +00:00			`raise ValidationError('no identifier value found')`
scrapes the metadata also moves id validation to core 2017-08-03 20:15:06 +00:00			`elif value == 'delete':`
			`return value`
lint 2018-01-03 18:43:02 +00:00			`isbn = ISBN(value)`
scrapes the metadata also moves id validation to core 2017-08-03 20:15:06 +00:00			`if isbn.error:`
fix exception, refine auth parsing 2017-10-27 16:08:27 +00:00			`raise ValidationError(isbn.error)`
scrapes the metadata also moves id validation to core 2017-08-03 20:15:06 +00:00			`isbn.validate()`
			`return isbn.to_string()`

			`def olwk_cleaner(value):`
			`if not value == 'delete' and value.startswith('/works/'):`
			`value = '/works/{}'.format(value)`
			`return value`

lint 2018-01-03 18:43:02 +00:00			`doi_match = re.compile(r'10\.\d+/\S+')`
scrapes the metadata also moves id validation to core 2017-08-03 20:15:06 +00:00
			`def doi_cleaner(value):`
			`if not value == 'delete' and not value.startswith('10.'):`
fix doi validation 2017-12-06 23:12:46 +00:00			`try:`
			`return doi_match.search(value).group(0)`
			`except AttributeError:`
			`return ''`
scrapes the metadata also moves id validation to core 2017-08-03 20:15:06 +00:00			`return value`
lint 2018-01-03 18:43:02 +00:00
scrapes the metadata also moves id validation to core 2017-08-03 20:15:06 +00:00			`ID_MORE_VALIDATION = {`
			`'isbn': isbn_cleaner,`
			`'olwk': olwk_cleaner,`
fix doi validation 2017-12-06 23:12:46 +00:00			`'doi': doi_cleaner,`
scrapes the metadata also moves id validation to core 2017-08-03 20:15:06 +00:00			`}`

gather isbns from schema.org and stop raising unwanted exceptions 2017-11-06 17:42:52 +00:00			`def identifier_cleaner(id_type, quiet=False):`
scrapes the metadata also moves id validation to core 2017-08-03 20:15:06 +00:00			`if ID_VALIDATION.has_key(id_type):`
			`(regex, err_msg) = ID_VALIDATION[id_type]`
			`extra = ID_MORE_VALIDATION.get(id_type, None)`
			`if isinstance(regex, (str, unicode)):`
			`regex = re.compile(regex)`
			`def cleaner(value):`
			`if not value:`
			`return None`
gather isbns from schema.org and stop raising unwanted exceptions 2017-11-06 17:42:52 +00:00			`try:`
			`if regex.match(value):`
			`if extra:`
			`value = extra(value)`
			`return value`
			`else:`
			`raise ValidationError(err_msg)`
			`except ValidationError as ve:`
			`if quiet:`
			`return None`
			`else:`
			`raise ve`
scrapes the metadata also moves id validation to core 2017-08-03 20:15:06 +00:00			`return cleaner`
			`return lambda value: value`

'format' is a built-in test_file was not doing anything because format parameter not passed. Didn't raise error because 'format' is a built-in function 2017-08-07 20:13:22 +00:00			`def test_file(the_file, fformat):`
			`if the_file and the_file.name:`
			`if fformat == 'epub':`
			`try:`
			`book = EPUB(the_file.file)`
			`except Exception as e:`
lint 2018-01-03 18:43:02 +00:00			`raise ValidationError(_('Are you sure this is an EPUB file?: %s' % e))`
'format' is a built-in test_file was not doing anything because format parameter not passed. Didn't raise error because 'format' is a built-in function 2017-08-07 20:13:22 +00:00			`elif fformat == 'mobi':`
			`try:`
			`book = Mobi(the_file.file)`
			`book.parse()`
			`except Exception as e:`
lint 2018-01-03 18:43:02 +00:00			`raise ValidationError(_('Are you sure this is a MOBI file?: %s' % e))`
'format' is a built-in test_file was not doing anything because format parameter not passed. Didn't raise error because 'format' is a built-in function 2017-08-07 20:13:22 +00:00			`elif fformat == 'pdf':`
			`try:`
lint 2018-01-03 18:43:02 +00:00			`PdfFileReader(the_file.file)`
'format' is a built-in test_file was not doing anything because format parameter not passed. Didn't raise error because 'format' is a built-in function 2017-08-07 20:13:22 +00:00			`except Exception, e:`
lint 2018-01-03 18:43:02 +00:00			`raise ValidationError(_('%s is not a valid PDF file' % the_file.name))`
'format' is a built-in test_file was not doing anything because format parameter not passed. Didn't raise error because 'format' is a built-in function 2017-08-07 20:13:22 +00:00			`return True`

precheck every new subject fix bug with '/' in subject interpret ';' as list delimiter add cleaner script 2017-09-15 19:55:37 +00:00			`def valid_xml_char_ordinal(c):`
			`codepoint = ord(c)`
			`# conditions ordered by presumed frequency`
			`return (`
			`0x20 <= codepoint <= 0xD7FF or`
			`codepoint in (0x9, 0xA, 0xD) or`
			`0xE000 <= codepoint <= 0xFFFD or`
			`0x10000 <= codepoint <= 0x10FFFF`
			`)`

lint 2018-01-03 18:43:02 +00:00			`def valid_subject(subject_name):`
precheck every new subject fix bug with '/' in subject interpret ';' as list delimiter add cleaner script 2017-09-15 19:55:37 +00:00			`num_commas = 0`
			`for c in subject_name:`
			`if not valid_xml_char_ordinal(c):`
			`return False`
			`if c == ',':`
			`num_commas += 1`
			`if num_commas > 2:`
			`return False`
			`return True`

improve namelist parsing 2017-10-06 20:04:59 +00:00			`reverse_name_comma = re.compile(r',(?! *Jr[\., ])')`

			`def unreverse_name(name):`
fix exception, refine auth parsing 2017-10-27 16:08:27 +00:00			`name = name.strip('.')`
improve namelist parsing 2017-10-06 20:04:59 +00:00			`if not reverse_name_comma.search(name):`
			`return name`
			`(last, rest) = name.split(',', 1)`
			`if not ',' in rest:`
			`return '%s %s' % (rest.strip(), last.strip())`
			`(first, rest) = rest.split(',', 1)`
			`return '%s %s, %s' % (first.strip(), last.strip(), rest.strip())`

authlist cleaner, definition lists 2017-09-28 17:25:56 +00:00			`def authlist_cleaner(authlist):`
			`''' given a author string or list of author strings, checks that the author string`
			`is not a list of author names and that no author is repeated'''`
			`if isinstance(authlist, str):`
			`authlist = [authlist]`
			`cleaned = []`
			`for auth in authlist:`
			`for cleaned_auth in auth_cleaner(auth):`
			`if cleaned_auth not in cleaned:`
			`cleaned.append(cleaned_auth)`
			`return cleaned`

			`# Match comma but not ", Jr"`
			`comma_list_delim = re.compile(r',(?! *Jr[\., ])')`
			`spaces = re.compile(r'\s+')`
improve namelist parsing 2017-10-06 20:04:59 +00:00			`_and_ = re.compile(r',? (and\|\&) ')`
			`semicolon_list_delim = re.compile(r'[\;\|\&]')`
fix exception, refine auth parsing 2017-10-27 16:08:27 +00:00			`reversed_name = re.compile(r'(de \|la \|los \|von \|van )*\w+, \w+.?( \w+.?)?(, Jr\.?)?')`
authlist cleaner, definition lists 2017-09-28 17:25:56 +00:00
			`def auth_cleaner(auth):`
			`''' given a author string checks that the author string`
			`is not a list of author names'''`
			`cleaned = []`
fix exception, refine auth parsing 2017-10-27 16:08:27 +00:00			`if ';' in auth or reversed_name.match(auth):`
lint 2018-01-03 18:43:02 +00:00			`authlist = semicolon_list_delim.split(auth)`
improve namelist parsing 2017-10-06 20:04:59 +00:00			`authlist = [unreverse_name(name) for name in authlist]`
authlist cleaner, definition lists 2017-09-28 17:25:56 +00:00			`else:`
improve namelist parsing 2017-10-06 20:04:59 +00:00			`auth = _and_.sub(',', auth)`
authlist cleaner, definition lists 2017-09-28 17:25:56 +00:00			`authlist = comma_list_delim.split(auth)`
			`for auth in authlist:`
			`cleaned.append(spaces.sub(' ', auth.strip()))`
			`return cleaned`
added date validation 2018-01-03 18:30:36 +00:00
			`MATCHYEAR = re.compile(r'(1\|2)\d\d\d')`
			`MATCHYMD = re.compile(r'(1\|2)\d\d\d-\d\d-\d\d')`

			`def validate_date(date_string):`
			`ymd = MATCHYMD.search(date_string)`
			`if ymd:`
			`return ymd.group(0)`
			`try:`
lint 2018-01-03 18:43:02 +00:00			`date = parse(date_string.strip(), default=datetime.date(999, 1, 1))`
added date validation 2018-01-03 18:30:36 +00:00			`if date.year != 999:`
			`return date.strftime('%Y')`
			`except ValueError:`
			`year = MATCHYEAR.search(date_string)`
			`if year:`
			`return year.group(0)`
lint 2018-01-03 18:43:02 +00:00			`return ''`