improve namelist parsing

2017-10-06 16:04:59 -04:00 · 2017-10-06 16:04:59 -04:00 · 86e38d08bb
parent c00d616c77
commit 86e38d08bb
2 changed files with 18 additions and 15 deletions
--- a/core/bookloader.py
+++ b/core/bookloader.py
@@ -37,7 +37,7 @@ from regluit.utils.localdatetime import now
 from . import cc
 from . import models
 from .parameters import WORK_IDENTIFIERS
-from .validation import identifier_cleaner
+from .validation import identifier_cleaner, unreverse_name
 from .loaders.scrape import get_scraper, scrape_sitemap

 logger = logging.getLogger(__name__)
@@ -735,16 +735,6 @@ IDTABLE = [('librarything', 'ltwk'), ('goodreads', 'gdrd'), ('openlibrary', 'olw
    ('edition_id', 'edid'), ('googlebooks', 'goog'), ('doi', 'doi'),
 ]

-def unreverse(name):
-    if not ',' in name:
-        return name
-    (last, rest) = name.split(',', 1)
-    if not ',' in rest:
-        return '%s %s' % (rest.strip(), last.strip())
-    (first, rest) = rest.split(',', 1)
-    return '%s %s, %s' % (first.strip(), last.strip(), rest.strip())
-
-
 def load_from_yaml(yaml_url, test_mode=False):
    """
    This really should be called 'load_from_github_yaml'
@@ -877,7 +867,7 @@ class BasePandataLoader(object):
                rel_code = inverse_marc_rels.get(key, 'aut')
                creators = creators if isinstance(creators, list) else [creators]
                for creator in creators:
-                    edition.add_author(unreverse(creator.get('agent_name', '')), relation=rel_code)
+                    edition.add_author(unreverse_name(creator.get('agent_name', '')), relation=rel_code)
        for yaml_subject in metadata.subjects: #always add yaml subjects (don't clear)
            if isinstance(yaml_subject, tuple):
                (authority, heading)  = yaml_subject
--- a/core/validation.py
+++ b/core/validation.py
@@ -129,6 +129,17 @@ def valid_subject( subject_name ):
                return False
    return True

+reverse_name_comma = re.compile(r',(?! *Jr[\., ])')
+
+def unreverse_name(name):
+    if not reverse_name_comma.search(name):
+        return name
+    (last, rest) = name.split(',', 1)
+    if not ',' in rest:
+        return '%s %s' % (rest.strip(), last.strip())
+    (first, rest) = rest.split(',', 1)
+    return '%s %s, %s' % (first.strip(), last.strip(), rest.strip())
+
 def authlist_cleaner(authlist):
    ''' given a author string or list of author strings, checks that the author string
        is not a list of author names and that no author is repeated'''
@@ -144,16 +155,18 @@ def authlist_cleaner(authlist):
 # Match comma but not ", Jr"
 comma_list_delim = re.compile(r',(?! *Jr[\., ])')
 spaces = re.compile(r'\s+')
-_and_ = re.compile(r',? and ')
+_and_ = re.compile(r',? (and|\&) ')
+semicolon_list_delim = re.compile(r'[\;|\&]')

 def auth_cleaner(auth):
    ''' given a author string checks that the author string
        is not a list of author names'''
    cleaned = []
-    auth = _and_.sub(',', auth)
    if ';' in auth:
-        authlist =  auth.split(';')
+        authlist =  semicolon_list_delim.split(auth)
+        authlist = [unreverse_name(name) for name in authlist]
    else:
+        auth = _and_.sub(',', auth)
        authlist = comma_list_delim.split(auth)
    for auth in authlist:
        cleaned.append(spaces.sub(' ', auth.strip()))