improve namelist parsing

pull/43/head
eric 2017-10-06 16:04:59 -04:00
parent c00d616c77
commit 86e38d08bb
2 changed files with 18 additions and 15 deletions

View File

@ -37,7 +37,7 @@ from regluit.utils.localdatetime import now
from . import cc
from . import models
from .parameters import WORK_IDENTIFIERS
from .validation import identifier_cleaner
from .validation import identifier_cleaner, unreverse_name
from .loaders.scrape import get_scraper, scrape_sitemap
logger = logging.getLogger(__name__)
@ -735,16 +735,6 @@ IDTABLE = [('librarything', 'ltwk'), ('goodreads', 'gdrd'), ('openlibrary', 'olw
('edition_id', 'edid'), ('googlebooks', 'goog'), ('doi', 'doi'),
]
def unreverse(name):
    """Turn a comma-reversed name into natural order.

    "Last, First" becomes "First Last"; "Last, First, Extra" becomes
    "First Last, Extra" (everything after the second comma is kept as a
    trailing part). A name that contains no comma is returned unchanged.
    Each part is stripped of surrounding whitespace before re-assembly.
    """
    if ',' not in name:
        return name
    surname, _, remainder = name.partition(',')
    given, second_comma, trailing = remainder.partition(',')
    if not second_comma:
        # Only one comma in the whole name: plain "Last, First".
        return '%s %s' % (given.strip(), surname.strip())
    return '%s %s, %s' % (given.strip(), surname.strip(), trailing.strip())
def load_from_yaml(yaml_url, test_mode=False):
"""
This really should be called 'load_from_github_yaml'
@ -877,7 +867,7 @@ class BasePandataLoader(object):
rel_code = inverse_marc_rels.get(key, 'aut')
creators = creators if isinstance(creators, list) else [creators]
for creator in creators:
edition.add_author(unreverse(creator.get('agent_name', '')), relation=rel_code)
edition.add_author(unreverse_name(creator.get('agent_name', '')), relation=rel_code)
for yaml_subject in metadata.subjects: #always add yaml subjects (don't clear)
if isinstance(yaml_subject, tuple):
(authority, heading) = yaml_subject

View File

@ -129,6 +129,17 @@ def valid_subject( subject_name ):
return False
return True
# A comma that separates name parts; a comma that introduces a "Jr" suffix
# (", Jr.", ", Jr," or ", Jr ") is deliberately not matched.
reverse_name_comma = re.compile(r',(?! *Jr[\., ])')


def unreverse_name(name):
    """Turn a comma-reversed name into natural order, respecting ", Jr".

    "Last, First" becomes "First Last" and "Last, First, Extra" becomes
    "First Last, Extra". A name whose only commas introduce a "Jr" suffix
    (e.g. "John Smith, Jr.") is treated as already in natural order and is
    returned unchanged. Each part is stripped of surrounding whitespace.

    The surname is split off at the first comma that is *not* a ", Jr"
    comma, so "Smith, Jr., John" yields "John Smith, Jr." instead of
    splitting the suffix away from the surname.
    """
    if not reverse_name_comma.search(name):
        return name
    # Split with the same pattern used for the guard above; a plain
    # split(',') could cut at a ", Jr" comma that the guard skipped over.
    (last, rest) = reverse_name_comma.split(name, 1)
    if ',' not in rest:
        return '%s %s' % (rest.strip(), last.strip())
    # Any further comma (including ", Jr") starts the trailing part.
    (first, rest) = rest.split(',', 1)
    return '%s %s, %s' % (first.strip(), last.strip(), rest.strip())
def authlist_cleaner(authlist):
''' given a author string or list of author strings, checks that the author string
is not a list of author names and that no author is repeated'''
@ -144,16 +155,18 @@ def authlist_cleaner(authlist):
# Match comma but not ", Jr"
comma_list_delim = re.compile(r',(?! *Jr[\., ])')
# One or more consecutive whitespace characters.
spaces = re.compile(r'\s+')
_and_ = re.compile(r',? and ')
# Optional comma, a space, then "and" or "&", then a space. This assignment
# rebinds `_and_`, so it replaces the "and"-only pattern on the line above.
_and_ = re.compile(r',? (and|\&) ')
# NOTE(review): inside a character class `|` is a literal, so this pattern
# matches ';', '|' or '&' -- confirm that splitting on '|' is intended.
semicolon_list_delim = re.compile(r'[\;|\&]')
def auth_cleaner(auth):
''' given a author string checks that the author string
is not a list of author names'''
cleaned = []
auth = _and_.sub(',', auth)
if ';' in auth:
authlist = auth.split(';')
authlist = semicolon_list_delim.split(auth)
authlist = [unreverse_name(name) for name in authlist]
else:
auth = _and_.sub(',', auth)
authlist = comma_list_delim.split(auth)
for auth in authlist:
cleaned.append(spaces.sub(' ', auth.strip()))