regluit/core/validation.py

# encoding: utf-8
'''
methods to validate and clean identifiers
'''
import re

from PyPDF2 import PdfFileReader

from django.forms import ValidationError
from regluit.pyepub import EPUB
from regluit.mobi import Mobi
from .isbn import ISBN

ID_VALIDATION = {
    'http': (re.compile(r"(https?|ftp)://(-\.)?([^\s/?\.#]+\.?)+(/[^\s]*)?$",
         flags=re.IGNORECASE|re.S ), 
         "The Web Address must be a valid http(s) URL."),  
    'isbn':  (r'^([\dxX\-–— ]+|delete)$', 
        "The ISBN must be a valid ISBN-13."),
    'doab': (r'^(\d{1,6}|delete)$', 
        "The value must be 1-6 digits."),
    'gtbg': (r'^(\d{1,6}|delete)$',
        "The Gutenberg number must be 1-6 digits."),
    'doi': (r'^(https?://dx\.doi\.org/|https?://doi\.org/)?(10\.\d+/\S+|delete)$', 
        "The DOI value must be a valid DOI."),
    'oclc': (r'^(\d{8,12}|delete)$', 
        "The OCLCnum must be 8 or more digits."),
    'goog': (r'^([a-zA-Z0-9\-_]{12}|delete)$', 
        "The Google id must be 12 alphanumeric characters, dash or underscore."),
    'gdrd': (r'^(\d{1,8}|delete)$', 
        "The Goodreads ID must be 1-8 digits."),
    'thng': (r'(^\d{1,8}|delete)$', 
        "The LibraryThing ID must be 1-8 digits."),
    'olwk': (r'^(/works/\)?OLd{1,8}W|delete)$', 
        "The Open Library Work ID looks like 'OL####W'."),
    'glue': (r'^(\d{1,6}|delete)$', 
        "The Unglue.it ID must be 1-6 digits."),
    'ltwk': (r'^(\d{1,8}|delete)$', 
        "The LibraryThing work ID must be 1-8 digits."),
}

def isbn_cleaner(value):
    if value == 'delete':
        return value
    if not value:
        raise forms.ValidationError('no identifier value found')
    elif value == 'delete':
        return value
    isbn=ISBN(value)
    if isbn.error:
        raise forms.ValidationError(isbn.error)
    isbn.validate()
    return isbn.to_string()

def olwk_cleaner(value):
    if not value == 'delete' and value.startswith('/works/'):
        value = '/works/{}'.format(value)
    return value

doi_match = re.compile( r'10\.\d+/\S+')

def doi_cleaner(value):
    if not value == 'delete' and not value.startswith('10.'):
        return doi_match.match(value).group(0)
    return value
        
ID_MORE_VALIDATION = {
    'isbn': isbn_cleaner,
    'olwk': olwk_cleaner,
    'olwk': doi_cleaner,
}

def identifier_cleaner(id_type):
    if ID_VALIDATION.has_key(id_type):
        (regex, err_msg) = ID_VALIDATION[id_type]
        extra = ID_MORE_VALIDATION.get(id_type, None)
        if isinstance(regex, (str, unicode)):
            regex = re.compile(regex)
        def cleaner(value):
            if not value:
                return None
            if regex.match(value):
                if extra:
                    value = extra(value)
                return value
            else:
                raise ValidationError(err_msg)
        return cleaner
    return lambda value: value

def test_file(the_file, fformat):
    if the_file and the_file.name:
        if fformat == 'epub':
            try:
                book = EPUB(the_file.file)
            except Exception as e:
                raise forms.ValidationError(_('Are you sure this is an EPUB file?: %s' % e) )
        elif fformat == 'mobi':
            try:
                book = Mobi(the_file.file)
                book.parse()
            except Exception as e:
                raise forms.ValidationError(_('Are you sure this is a MOBI file?: %s' % e) )
        elif fformat == 'pdf':
            try:
                doc = PdfFileReader(the_file.file)
            except Exception, e:
                raise forms.ValidationError(_('%s is not a valid PDF file' % the_file.name) )
    return True
scrapes the metadata also moves id validation to core 2017-08-03 20:15:06 +00:00			`# encoding: utf-8`
			`'''`
			`methods to validate and clean identifiers`
			`'''`
			`import re`
'format' is a built-in test_file was not doing anything because format parameter not passed. Didn't raise error because 'format' is a built-in function 2017-08-07 20:13:22 +00:00
			`from PyPDF2 import PdfFileReader`

scrapes the metadata also moves id validation to core 2017-08-03 20:15:06 +00:00			`from django.forms import ValidationError`
'format' is a built-in test_file was not doing anything because format parameter not passed. Didn't raise error because 'format' is a built-in function 2017-08-07 20:13:22 +00:00			`from regluit.pyepub import EPUB`
			`from regluit.mobi import Mobi`
scrapes the metadata also moves id validation to core 2017-08-03 20:15:06 +00:00			`from .isbn import ISBN`

			`ID_VALIDATION = {`
			`'http': (re.compile(r"(https?\|ftp)://(-\.)?([^\s/?\.#]+\.?)+(/[^\s]*)?$",`
			`flags=re.IGNORECASE\|re.S ),`
			`"The Web Address must be a valid http(s) URL."),`
			`'isbn': (r'^([\dxX\-–— ]+\|delete)$',`
			`"The ISBN must be a valid ISBN-13."),`
			`'doab': (r'^(\d{1,6}\|delete)$',`
			`"The value must be 1-6 digits."),`
			`'gtbg': (r'^(\d{1,6}\|delete)$',`
			`"The Gutenberg number must be 1-6 digits."),`
			`'doi': (r'^(https?://dx\.doi\.org/\|https?://doi\.org/)?(10\.\d+/\S+\|delete)$',`
			`"The DOI value must be a valid DOI."),`
			`'oclc': (r'^(\d{8,12}\|delete)$',`
			`"The OCLCnum must be 8 or more digits."),`
			`'goog': (r'^([a-zA-Z0-9\-_]{12}\|delete)$',`
			`"The Google id must be 12 alphanumeric characters, dash or underscore."),`
			`'gdrd': (r'^(\d{1,8}\|delete)$',`
			`"The Goodreads ID must be 1-8 digits."),`
			`'thng': (r'(^\d{1,8}\|delete)$',`
			`"The LibraryThing ID must be 1-8 digits."),`
			`'olwk': (r'^(/works/\)?OLd{1,8}W\|delete)$',`
			`"The Open Library Work ID looks like 'OL####W'."),`
			`'glue': (r'^(\d{1,6}\|delete)$',`
			`"The Unglue.it ID must be 1-6 digits."),`
			`'ltwk': (r'^(\d{1,8}\|delete)$',`
			`"The LibraryThing work ID must be 1-8 digits."),`
			`}`

			`def isbn_cleaner(value):`
			`if value == 'delete':`
			`return value`
			`if not value:`
			`raise forms.ValidationError('no identifier value found')`
			`elif value == 'delete':`
			`return value`
			`isbn=ISBN(value)`
			`if isbn.error:`
			`raise forms.ValidationError(isbn.error)`
			`isbn.validate()`
			`return isbn.to_string()`

			`def olwk_cleaner(value):`
			`if not value == 'delete' and value.startswith('/works/'):`
			`value = '/works/{}'.format(value)`
			`return value`

			`doi_match = re.compile( r'10\.\d+/\S+')`

			`def doi_cleaner(value):`
			`if not value == 'delete' and not value.startswith('10.'):`
			`return doi_match.match(value).group(0)`
			`return value`

			`ID_MORE_VALIDATION = {`
			`'isbn': isbn_cleaner,`
			`'olwk': olwk_cleaner,`
			`'olwk': doi_cleaner,`
			`}`

			`def identifier_cleaner(id_type):`
			`if ID_VALIDATION.has_key(id_type):`
			`(regex, err_msg) = ID_VALIDATION[id_type]`
			`extra = ID_MORE_VALIDATION.get(id_type, None)`
			`if isinstance(regex, (str, unicode)):`
			`regex = re.compile(regex)`
			`def cleaner(value):`
			`if not value:`
			`return None`
			`if regex.match(value):`
			`if extra:`
			`value = extra(value)`
			`return value`
			`else:`
			`raise ValidationError(err_msg)`
			`return cleaner`
			`return lambda value: value`

'format' is a built-in test_file was not doing anything because format parameter not passed. Didn't raise error because 'format' is a built-in function 2017-08-07 20:13:22 +00:00			`def test_file(the_file, fformat):`
			`if the_file and the_file.name:`
			`if fformat == 'epub':`
			`try:`
			`book = EPUB(the_file.file)`
			`except Exception as e:`
			`raise forms.ValidationError(_('Are you sure this is an EPUB file?: %s' % e) )`
			`elif fformat == 'mobi':`
			`try:`
			`book = Mobi(the_file.file)`
			`book.parse()`
			`except Exception as e:`
			`raise forms.ValidationError(_('Are you sure this is a MOBI file?: %s' % e) )`
			`elif fformat == 'pdf':`
			`try:`
			`doc = PdfFileReader(the_file.file)`
			`except Exception, e:`
			`raise forms.ValidationError(_('%s is not a valid PDF file' % the_file.name) )`
			`return True`