add author string cleanup

2022-09-20 20:18:34 -04:00 · 2022-09-20 20:18:34 -04:00 · 3e1ed44408
parent 723c0a0015
commit 3e1ed44408
4 changed files with 41 additions and 30 deletions
--- a/.python-version
+++ b/.python-version
@@ -1 +1 @@
-3.8.13
+3.9.11
--- a/core/management/commands/clean_db_strings.py
+++ b/core/management/commands/clean_db_strings.py
@@ -2,7 +2,7 @@ from django.core.management.base import BaseCommand
 from django.db import IntegrityError

 from regluit.core import models
-from regluit.utils.text import sanitize_line, remove_badxml
+from regluit.utils.text import sanitize_line, remove_author_junk, remove_badxml


 class Command(BaseCommand):
@@ -36,18 +36,19 @@ class Command(BaseCommand):
                edition_titles_fixed +=1
        self.stdout.write("edition_titles_fixed = {}".format(edition_titles_fixed))
        for author in models.Author.objects.all():
-            if sanitize_line(author.name) != author.name:
-                author.name = sanitize_line(author.name)
-                try:
-                    author.save()
-                except IntegrityError as e:
-                    # duplicate entry
-                    correct = models.Author.objects.get(name=sanitize_line(author.name))
-                    for relator in author.relator_set.all():
-                        relator.author = correct
-                        relator.save()
-                    author.delete() 
-                author_names_fixed +=1
+            if remove_author_junk(sanitize_line(author.name)) != author.name:
+                author.name = remove_author_junk(sanitize_line(author.name))
+                if author.name:
+                    try:
+                        author.save()
+                    except IntegrityError as e:
+                        # duplicate entry
+                        correct = models.Author.objects.get(name=sanitize_line(author.name))
+                        for relator in author.relator_set.all():
+                            relator.author = correct
+                            relator.save()
+                        author.delete() 
+                    author_names_fixed +=1
        self.stdout.write("author_names_fixed = {}".format(author_names_fixed))
        for publishername in models.PublisherName.objects.all():
            if sanitize_line(publishername.name) != publishername.name:
--- a/core/validation.py
+++ b/core/validation.py
@@ -14,6 +14,7 @@ from django.utils.translation import ugettext_lazy as _

 from pyepub import EPUB
 from .isbn import ISBN
+from regluit.utils.text import remove_author_junk

 logger = logging.getLogger(__name__)

@@ -207,6 +208,7 @@ def auth_cleaner(auth):
        auth = _and_.sub(',', auth)
        authlist = comma_list_delim.split(auth)
    for auth in authlist:
+        auth = remove_author_junk(auth)
        cleaned.append(spaces.sub(' ', auth.strip()))
    return cleaned

--- a/utils/text.py
+++ b/utils/text.py
@@ -4,22 +4,24 @@ import unicodedata


 #https://stackoverflow.com/questions/1707890/fast-way-to-filter-illegal-xml-unicode-chars-in-python
-_illegal_unichrs = [(0x00, 0x08), (0x0B, 0x0C), (0x0E, 0x1F), 
-                        (0x7F, 0x84), (0x86, 0x9F), 
-                        (0xFDD0, 0xFDDF), (0xFFFE, 0xFFFF)] 
-if sys.maxunicode >= 0x10000:  # not narrow build 
-        _illegal_unichrs.extend([(0x1FFFE, 0x1FFFF), (0x2FFFE, 0x2FFFF), 
-                                 (0x3FFFE, 0x3FFFF), (0x4FFFE, 0x4FFFF), 
-                                 (0x5FFFE, 0x5FFFF), (0x6FFFE, 0x6FFFF), 
-                                 (0x7FFFE, 0x7FFFF), (0x8FFFE, 0x8FFFF), 
-                                 (0x9FFFE, 0x9FFFF), (0xAFFFE, 0xAFFFF), 
-                                 (0xBFFFE, 0xBFFFF), (0xCFFFE, 0xCFFFF), 
-                                 (0xDFFFE, 0xDFFFF), (0xEFFFE, 0xEFFFF), 
-                                 (0xFFFFE, 0xFFFFF), (0x10FFFE, 0x10FFFF)]) 
+_illegal_unichrs = [(0x00, 0x08), (0x0B, 0x0C), (0x0E, 0x1F),
+                        (0x7F, 0x84), (0x86, 0x9F),
+                        (0xFDD0, 0xFDDF), (0xFFFE, 0xFFFF)]
+if sys.maxunicode >= 0x10000:  # not narrow build
+        _illegal_unichrs.extend([(0x1FFFE, 0x1FFFF), (0x2FFFE, 0x2FFFF),
+                                 (0x3FFFE, 0x3FFFF), (0x4FFFE, 0x4FFFF),
+                                 (0x5FFFE, 0x5FFFF), (0x6FFFE, 0x6FFFF),
+                                 (0x7FFFE, 0x7FFFF), (0x8FFFE, 0x8FFFF),
+                                 (0x9FFFE, 0x9FFFF), (0xAFFFE, 0xAFFFF),
+                                 (0xBFFFE, 0xBFFFF), (0xCFFFE, 0xCFFFF),
+                                 (0xDFFFE, 0xDFFFF), (0xEFFFE, 0xEFFFF),
+                                 (0xFFFFE, 0xFFFFF), (0x10FFFE, 0x10FFFF)])

-_illegal_ranges = ["%s-%s" % (chr(low), chr(high)) 
-                   for (low, high) in _illegal_unichrs] 
-_illegal_xml_chars_RE = re.compile(u'[%s]' % u''.join(_illegal_ranges)) 
+_illegal_ranges = ["%s-%s" % (chr(low), chr(high))
+                   for (low, high) in _illegal_unichrs]
+_illegal_xml_chars_RE = re.compile('[%s]' % ''.join(_illegal_ranges))
+
+_ends_in_num = re.compile(r'\W*\d+$')

 def remove_badxml(s):
    return _illegal_xml_chars_RE.sub('', s)
@@ -29,5 +31,11 @@ _ws_runs_RE = re.compile(r'[\r\n\t]+')
 def sanitize_ws(s):
    return _ws_runs_RE.sub(u' ', s)

+
 def sanitize_line(s):
-    return remove_badxml(sanitize_ws(s)).strip()
+    return unicodedata.normalize('NFC', remove_badxml(sanitize_ws(s)).strip())
+
+def remove_author_junk(authname):
+    """Return authname with any 'ORCID:' suffix and any trailing number (plus the non-word chars before it) removed."""
+    authname = authname.split('ORCID:')[0].strip()  # always strip: trailing whitespace would hide the number from the `$` anchor
+    return _ends_in_num.sub('', authname)
 @ -1 +1 @@
 .8.13
 .9.11