improve namelist parsing
parent
c00d616c77
commit
86e38d08bb
|
@ -37,7 +37,7 @@ from regluit.utils.localdatetime import now
|
|||
from . import cc
|
||||
from . import models
|
||||
from .parameters import WORK_IDENTIFIERS
|
||||
from .validation import identifier_cleaner
|
||||
from .validation import identifier_cleaner, unreverse_name
|
||||
from .loaders.scrape import get_scraper, scrape_sitemap
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
@ -735,16 +735,6 @@ IDTABLE = [('librarything', 'ltwk'), ('goodreads', 'gdrd'), ('openlibrary', 'olw
|
|||
('edition_id', 'edid'), ('googlebooks', 'goog'), ('doi', 'doi'),
|
||||
]
|
||||
|
||||
def unreverse(name):
    """Turn a "Last, First[, Suffix]" name into "First Last[, Suffix]".

    A name without a comma is returned unchanged. The text before the first
    comma is treated as the family name, the text between the first and second
    comma as the given name(s), and anything after the second comma is kept as
    a trailing suffix (it may itself contain further commas).

    Empty fields are dropped rather than formatted, so a stray trailing comma
    ("Smith," or "Smith, John,") does not leave a leading space or a dangling
    ", " in the result.
    """
    if ',' not in name:
        return name
    last, _, rest = name.partition(',')
    first, _, suffix = rest.partition(',')
    # Join only the non-empty pieces so a missing given name cannot introduce
    # stray whitespace.
    pieces = [piece for piece in (first.strip(), last.strip()) if piece]
    full_name = ' '.join(pieces)
    suffix = suffix.strip()
    if suffix:
        return '%s, %s' % (full_name, suffix)
    return full_name
|
||||
|
||||
|
||||
def load_from_yaml(yaml_url, test_mode=False):
|
||||
"""
|
||||
This really should be called 'load_from_github_yaml'
|
||||
|
@ -877,7 +867,7 @@ class BasePandataLoader(object):
|
|||
rel_code = inverse_marc_rels.get(key, 'aut')
|
||||
creators = creators if isinstance(creators, list) else [creators]
|
||||
for creator in creators:
|
||||
edition.add_author(unreverse(creator.get('agent_name', '')), relation=rel_code)
|
||||
edition.add_author(unreverse_name(creator.get('agent_name', '')), relation=rel_code)
|
||||
for yaml_subject in metadata.subjects: #always add yaml subjects (don't clear)
|
||||
if isinstance(yaml_subject, tuple):
|
||||
(authority, heading) = yaml_subject
|
||||
|
|
|
@ -129,6 +129,17 @@ def valid_subject( subject_name ):
|
|||
return False
|
||||
return True
|
||||
|
||||
# Match a comma that separates name parts, but not the comma in ", Jr".
reverse_name_comma = re.compile(r',(?! *Jr[\., ])')


def unreverse_name(name):
    """Turn a "Last, First[, Suffix]" name into "First Last[, Suffix]".

    A name with no reversing comma is returned unchanged; the comma in front
    of "Jr" (as matched by ``reverse_name_comma``) does not count, so
    "Martin Luther King, Jr." is left alone.

    The family name is everything before the first *reversing* comma, so a
    suffix that travels with the family name stays attached to it:
    "Smith, Jr., John" becomes "John Smith, Jr.". After that split, anything
    following the next comma in the remainder is kept as a trailing suffix:
    "King, Martin Luther, Jr." becomes "Martin Luther King, Jr.".

    Empty fields (for example from a trailing comma) are dropped instead of
    leaving stray spaces or a dangling ", " in the result.
    """
    # Split on the same pattern that decides whether the name is reversed;
    # splitting on a raw ',' here would cut at a protected ", Jr" comma.
    parts = reverse_name_comma.split(name, maxsplit=1)
    if len(parts) == 1:
        return name
    last, rest = parts
    first, _, suffix = rest.partition(',')
    pieces = [piece for piece in (first.strip(), last.strip()) if piece]
    full_name = ' '.join(pieces)
    suffix = suffix.strip()
    if suffix:
        return '%s, %s' % (full_name, suffix)
    return full_name
|
||||
|
||||
def authlist_cleaner(authlist):
|
||||
''' given a author string or list of author strings, checks that the author string
|
||||
is not a list of author names and that no author is repeated'''
|
||||
|
@ -144,16 +155,18 @@ def authlist_cleaner(authlist):
|
|||
# Match comma but not ", Jr"
|
||||
comma_list_delim = re.compile(r',(?! *Jr[\., ])')
|
||||
spaces = re.compile(r'\s+')
|
||||
_and_ = re.compile(r',? and ')
|
||||
_and_ = re.compile(r',? (and|\&) ')
|
||||
semicolon_list_delim = re.compile(r'[\;|\&]')
|
||||
|
||||
def auth_cleaner(auth):
|
||||
''' given a author string checks that the author string
|
||||
is not a list of author names'''
|
||||
cleaned = []
|
||||
auth = _and_.sub(',', auth)
|
||||
if ';' in auth:
|
||||
authlist = auth.split(';')
|
||||
authlist = semicolon_list_delim.split(auth)
|
||||
authlist = [unreverse_name(name) for name in authlist]
|
||||
else:
|
||||
auth = _and_.sub(',', auth)
|
||||
authlist = comma_list_delim.split(auth)
|
||||
for auth in authlist:
|
||||
cleaned.append(spaces.sub(' ', auth.strip()))
|
||||
|
|
Loading…
Reference in New Issue