add author string cleanup
parent
723c0a0015
commit
3e1ed44408
|
@ -1 +1 @@
|
|||
3.8.13
|
||||
3.9.11
|
||||
|
|
|
@ -2,7 +2,7 @@ from django.core.management.base import BaseCommand
|
|||
from django.db import IntegrityError
|
||||
|
||||
from regluit.core import models
|
||||
from regluit.utils.text import sanitize_line, remove_badxml
|
||||
from regluit.utils.text import sanitize_line, remove_author_junk, remove_badxml
|
||||
|
||||
|
||||
class Command(BaseCommand):
|
||||
|
@ -36,18 +36,19 @@ class Command(BaseCommand):
|
|||
edition_titles_fixed +=1
|
||||
self.stdout.write("edition_titles_fixed = {}".format(edition_titles_fixed))
|
||||
for author in models.Author.objects.all():
|
||||
if sanitize_line(author.name) != author.name:
|
||||
author.name = sanitize_line(author.name)
|
||||
try:
|
||||
author.save()
|
||||
except IntegrityError as e:
|
||||
# duplicate entry
|
||||
correct = models.Author.objects.get(name=sanitize_line(author.name))
|
||||
for relator in author.relator_set.all():
|
||||
relator.author = correct
|
||||
relator.save()
|
||||
author.delete()
|
||||
author_names_fixed +=1
|
||||
if remove_author_junk(sanitize_line(author.name)) != author.name:
|
||||
author.name = remove_author_junk(sanitize_line(author.name))
|
||||
if author.name:
|
||||
try:
|
||||
author.save()
|
||||
except IntegrityError as e:
|
||||
# duplicate entry
|
||||
correct = models.Author.objects.get(name=sanitize_line(author.name))
|
||||
for relator in author.relator_set.all():
|
||||
relator.author = correct
|
||||
relator.save()
|
||||
author.delete()
|
||||
author_names_fixed +=1
|
||||
self.stdout.write("author_names_fixed = {}".format(author_names_fixed))
|
||||
for publishername in models.PublisherName.objects.all():
|
||||
if sanitize_line(publishername.name) != publishername.name:
|
||||
|
|
|
@ -14,6 +14,7 @@ from django.utils.translation import ugettext_lazy as _
|
|||
|
||||
from pyepub import EPUB
|
||||
from .isbn import ISBN
|
||||
from regluit.utils.text import remove_author_junk
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
@ -207,6 +208,7 @@ def auth_cleaner(auth):
|
|||
auth = _and_.sub(',', auth)
|
||||
authlist = comma_list_delim.split(auth)
|
||||
for auth in authlist:
|
||||
auth = remove_author_junk(auth)
|
||||
cleaned.append(spaces.sub(' ', auth.strip()))
|
||||
return cleaned
|
||||
|
||||
|
|
|
@ -4,22 +4,24 @@ import unicodedata
|
|||
|
||||
|
||||
# https://stackoverflow.com/questions/1707890/fast-way-to-filter-illegal-xml-unicode-chars-in-python
# Inclusive (low, high) code point ranges that remove_badxml() deletes.
# Tab (0x09), line feed (0x0A), carriage return (0x0D) and 0x85 are
# deliberately absent from the table, so they survive the filter.
# NOTE(review): the Unicode noncharacter block runs U+FDD0..U+FDEF, but this
# table stops at U+FDDF as in the answer linked above -- confirm that is intended.
_illegal_unichrs = [
    (0x00, 0x08), (0x0B, 0x0C), (0x0E, 0x1F),
    (0x7F, 0x84), (0x86, 0x9F),
    (0xFDD0, 0xFDDF), (0xFFFE, 0xFFFF),
]
if sys.maxunicode >= 0x10000:  # not narrow build
    # The last two code points (xFFFE and xFFFF) of planes 1 through 16,
    # i.e. (0x1FFFE, 0x1FFFF) up to (0x10FFFE, 0x10FFFF).
    _illegal_unichrs.extend(
        ((plane << 16) | 0xFFFE, (plane << 16) | 0xFFFF)
        for plane in range(0x01, 0x11)
    )

# One "low-high" character-class range per table entry, compiled once at
# import time into a single character class.
_illegal_ranges = ["%s-%s" % (chr(low), chr(high))
                   for (low, high) in _illegal_unichrs]
_illegal_xml_chars_RE = re.compile('[%s]' % ''.join(_illegal_ranges))

# Trailing run of digits together with any non-word characters before it.
_ends_in_num = re.compile(r'\W*\d+$')


def remove_badxml(s):
    """Return *s* with every character listed in ``_illegal_unichrs`` removed.

    Characters outside the table are returned unchanged and in their
    original order; an empty string yields an empty string.
    """
    return _illegal_xml_chars_RE.sub('', s)
|
||||
|
@ -29,5 +31,11 @@ _ws_runs_RE = re.compile(r'[\r\n\t]+')
|
|||
def sanitize_ws(s):
    """Replace each match of ``_ws_runs_RE`` in *s* with a single space."""
    single_space = ' '
    collapsed = _ws_runs_RE.sub(single_space, s)
    return collapsed
|
||||
|
||||
|
||||
def sanitize_line(s):
    """Return *s* cleaned up as a single line, in Unicode NFC form.

    The steps run in this order:

    1. ``sanitize_ws`` rewrites whitespace runs as spaces.
    2. ``remove_badxml`` drops characters that are illegal in XML.
    3. Leading and trailing whitespace is stripped.
    4. The result is normalized to NFC, so canonically equivalent
       spellings of the same text compare equal.
    """
    cleaned = remove_badxml(sanitize_ws(s)).strip()
    # Only the normalizing return is kept: an earlier un-normalized return
    # ahead of it would make this step unreachable.
    return unicodedata.normalize('NFC', cleaned)
|
||||
|
||||
def remove_author_junk(authname):
    """Strip an ORCID suffix and a trailing number from an author name.

    If ``'ORCID:'`` occurs in *authname*, only the text before its first
    occurrence is kept, with surrounding whitespace stripped. Whatever
    ``_ends_in_num`` matches at the end of the name is then removed.
    """
    before, marker, _after = authname.partition('ORCID:')
    if marker:
        # The marker was found: keep only the part that precedes it.
        authname = before.strip()
    return _ends_in_num.sub('', authname)
|
||||
|
|
Loading…
Reference in New Issue