pull/46/head
eric 2018-01-03 13:43:02 -05:00
parent e837dd6ff2
commit 6dfa1bccb4
1 changed files with 37 additions and 36 deletions

View File

@ -4,41 +4,43 @@ methods to validate and clean identifiers
'''
import re
import datetime
from dateutil.parser import parse
from dateutil.parser import parse
from PyPDF2 import PdfFileReader
from django.forms import ValidationError
from django.utils.translation import ugettext_lazy as _
from regluit.pyepub import EPUB
from regluit.mobi import Mobi
from .isbn import ISBN
# Maps an identifier-type code to a ``(pattern, error_message)`` pair.
# The pattern is either a compiled regex ('http') or a raw pattern string
# (every other entry); the message is the text paired with that pattern for
# reporting a value that does not match.  Every string pattern also accepts
# the literal value 'delete'.
ID_VALIDATION = {
    'http': (re.compile(r"(https?|ftp)://(-\.)?([^\s/?\.#]+\.?)+(/[^\s]*)?$",
                        flags=re.IGNORECASE|re.S),
             "The Web Address must be a valid http(s) URL."),
    # digits, x/X check character, hyphen, en/em dash and space are allowed
    'isbn': (r'^([\dxX\-–— ]+|delete)$',
             "The ISBN must be a valid ISBN-13."),
    'doab': (r'^(\d{1,6}|delete)$',
             "The value must be 1-6 digits."),
    'gtbg': (r'^(\d{1,6}|delete)$',
             "The Gutenberg number must be 1-6 digits."),
    # optional (dx.)doi.org resolver prefix, then the bare DOI or 'delete'
    'doi': (r'^(https?://dx\.doi\.org/|https?://doi\.org/)?(10\.\d+/\S+|delete)$',
            "The DOI value must be a valid DOI."),
    # NOTE(review): the pattern caps the length at 12 digits although the
    # message only says "8 or more" -- confirm which one is intended.
    'oclc': (r'^(\d{8,12}|delete)$',
             "The OCLCnum must be 8 or more digits."),
    'goog': (r'^([a-zA-Z0-9\-_]{12}|delete)$',
             "The Google id must be 12 alphanumeric characters, dash or underscore."),
    'gdrd': (r'^(\d{1,8}|delete)$',
             "The Goodreads ID must be 1-8 digits."),
    # '^' moved outside the group so that the 'delete' alternative is anchored
    # at the start as well, matching every other entry.
    'thng': (r'^(\d{1,8}|delete)$',
             "The LibraryThing ID must be 1-8 digits."),
    # Fixed: the previous pattern escaped the closing parenthesis and used a
    # literal 'd' instead of '\d', so it rejected real IDs such as 'OL123W'.
    # Accepts 'OL####W' with an optional '/works/' prefix, or 'delete'.
    'olwk': (r'^((/works/)?OL\d{1,8}W|delete)$',
             "The Open Library Work ID looks like 'OL####W'."),
    'glue': (r'^(\d{1,6}|delete)$',
             "The Unglue.it ID must be 1-6 digits."),
    'ltwk': (r'^(\d{1,8}|delete)$',
             "The LibraryThing work ID must be 1-8 digits."),
}
def isbn_cleaner(value):
@ -48,7 +50,7 @@ def isbn_cleaner(value):
raise ValidationError('no identifier value found')
elif value == 'delete':
return value
isbn=ISBN(value)
isbn = ISBN(value)
if isbn.error:
raise ValidationError(isbn.error)
isbn.validate()
@ -59,7 +61,7 @@ def olwk_cleaner(value):
value = '/works/{}'.format(value)
return value
# Pattern for a bare DOI: "10.", one or more digits, "/", then one or more
# non-whitespace characters.
doi_match = re.compile( r'10\.\d+/\S+')
doi_match = re.compile(r'10\.\d+/\S+')
def doi_cleaner(value):
if not value == 'delete' and not value.startswith('10.'):
@ -68,7 +70,7 @@ def doi_cleaner(value):
except AttributeError:
return ''
return value
ID_MORE_VALIDATION = {
'isbn': isbn_cleaner,
'olwk': olwk_cleaner,
@ -105,18 +107,18 @@ def test_file(the_file, fformat):
try:
book = EPUB(the_file.file)
except Exception as e:
raise ValidationError(_('Are you sure this is an EPUB file?: %s' % e) )
raise ValidationError(_('Are you sure this is an EPUB file?: %s' % e))
elif fformat == 'mobi':
try:
book = Mobi(the_file.file)
book.parse()
except Exception as e:
raise ValidationError(_('Are you sure this is a MOBI file?: %s' % e) )
raise ValidationError(_('Are you sure this is a MOBI file?: %s' % e))
elif fformat == 'pdf':
try:
doc = PdfFileReader(the_file.file)
PdfFileReader(the_file.file)
except Exception, e:
raise ValidationError(_('%s is not a valid PDF file' % the_file.name) )
raise ValidationError(_('%s is not a valid PDF file' % the_file.name))
return True
def valid_xml_char_ordinal(c):
@ -129,7 +131,7 @@ def valid_xml_char_ordinal(c):
0x10000 <= codepoint <= 0x10FFFF
)
def valid_subject( subject_name ):
def valid_subject(subject_name):
num_commas = 0
for c in subject_name:
if not valid_xml_char_ordinal(c):
@ -176,7 +178,7 @@ def auth_cleaner(auth):
is not a list of author names'''
cleaned = []
if ';' in auth or reversed_name.match(auth):
authlist = semicolon_list_delim.split(auth)
authlist = semicolon_list_delim.split(auth)
authlist = [unreverse_name(name) for name in authlist]
else:
auth = _and_.sub(',', auth)
@ -193,12 +195,11 @@ def validate_date(date_string):
if ymd:
return ymd.group(0)
try:
date = parse(date_string.strip(), default=datetime.date(999,1,1))
date = parse(date_string.strip(), default=datetime.date(999, 1, 1))
if date.year != 999:
return date.strftime('%Y')
except ValueError:
year = MATCHYEAR.search(date_string)
if year:
return year.group(0)
else:
return ''
return ''