authlist cleaner, definition lists

pull/43/head
eric 2017-09-28 13:25:56 -04:00
parent 467ab8a425
commit fa4573a74d
2 changed files with 77 additions and 24 deletions

View File

@ -7,7 +7,7 @@ from django.conf import settings
from urlparse import urljoin
from regluit.core import models
from regluit.core.validation import identifier_cleaner
from regluit.core.validation import identifier_cleaner, authlist_cleaner
logger = logging.getLogger(__name__)
@ -47,6 +47,10 @@ class BaseScraper(object):
logger.error(e)
self.metadata = {}
self.metadata['identifiers'] = self.identifiers
#
# utilities
#
def set(self, name, value):
    '''Store a single scraped value in the metadata dict under key name.'''
    self.metadata[name] = value
@ -84,6 +88,16 @@ class BaseScraper(object):
if value:
return value
return value
def get_dt_dd(self, name):
''' get the content of <dd> after a <dt> containing name'''
dt = self.doc.find('dt', string=re.compile(name))
dd = dt.find_next_sibling('dd') if dt else None
return dd.text if dd else None
#
# getters
#
def get_genre(self):
value = self.check_metas(['DC.Type', 'dc.type', 'og:type'])
@ -109,6 +123,20 @@ class BaseScraper(object):
])
self.set('description', value)
def get_isbns(self):
    '''Return a dict of edition keys ('isbn_epub', 'isbn_mobi', ...) to
    cleaned ISBNs found in 'citation_isbn' metas.

    Each ISBN found is also recorded in self.identifiers under the same key.
    Formats with no valid ISBN are omitted from the result.
    '''
    isbns = {}
    label_map = {'epub': 'EPUB', 'mobi': 'Mobi',
                 'paper': 'Paperback', 'pdf': 'PDF', 'hard': 'Hardback'}
    # the same cleaner applies to every format, so build it once
    clean_isbn = identifier_cleaner('isbn')
    # iterate items() instead of keys() + re-indexing the dict
    for key, label in label_map.items():
        value = clean_isbn(self.check_metas(['citation_isbn'], type=label))
        if value:
            isbn_key = 'isbn_{}'.format(key)
            isbns[isbn_key] = value
            self.identifiers[isbn_key] = value
    return isbns
def get_identifiers(self):
value = self.check_metas(['DC.Identifier.URI'])
if not value:
@ -121,17 +149,8 @@ class BaseScraper(object):
value = identifier_cleaner('doi')(value)
if value:
self.identifiers['doi'] = value
isbns = {}
label_map = {'epub': 'EPUB', 'mobi': 'Mobi',
'paper': 'Paperback', 'pdf':'PDF', 'hard':'Hardback'}
for key in label_map.keys():
isbn_key = 'isbn_{}'.format(key)
value = self.check_metas(['citation_isbn'], type=label_map[key])
value = identifier_cleaner('isbn')(value)
if value:
isbns[isbn_key] = value
self.identifiers[isbn_key] = value
isbns = self.get_isbns()
ed_list = []
if len(isbns):
#need to create edition list
@ -178,18 +197,12 @@ class BaseScraper(object):
if not value_list:
return
creator_list = []
value_list = authlist_cleaner(value_list)
if len(value_list) == 1:
#first check if the value is really a list
auth = value_list[0]
authlist = auth.split(' and ')
if len(authlist) == 1:
self.set('creator', {'author': {'agent_name': auth}})
return
else:
value_list = authlist[0].split(',') + [authlist[1]]
self.set('creator', {'author': {'agent_name': auth.strip()}})
return
for auth in value_list:
creator_list.append({'agent_name': auth})
creator_list.append({'agent_name': auth.strip()})
self.set('creator', {'authors': creator_list })
@ -234,8 +247,10 @@ class PressbooksScraper(BaseScraper):
self.set('download_url_{}'.format(dl_type), value)
def get_publisher(self):
value = self.doc.select_one('.cie-name')
value = value.text if value else None
value = self.get_dt_dd('Publisher')
if not value:
value = self.doc.select_one('.cie-name')
value = value.text if value else None
if value:
self.set('publisher', value)
else:
@ -249,6 +264,16 @@ class PressbooksScraper(BaseScraper):
else:
super(PressbooksScraper, self).get_title()
def get_isbns(self):
    '''Add isbn identifiers and return a dict of edition keys and ISBNs.

    Pressbooks pages expose ISBNs as <dt>/<dd> definition-list entries
    rather than metas, so look them up via get_dt_dd.

    NOTE(review): this returns keys 'electronic'/'paper' while the base
    class get_isbns returns 'isbn_epub' etc. -- confirm callers accept
    both key shapes.
    '''
    isbns = {}
    sources = [('electronic', 'Ebook ISBN'), ('paper', 'Print ISBN')]
    for key, label in sources:
        cleaned = identifier_cleaner('isbn')(self.get_dt_dd(label))
        if not cleaned:
            continue
        self.identifiers['isbn_{}'.format(key)] = cleaned
        isbns[key] = cleaned
    return isbns
@classmethod
def can_scrape(cls, url):
''' return True if the class can scrape the URL '''

View File

@ -129,3 +129,31 @@ def valid_subject( subject_name ):
return False
return True
def authlist_cleaner(authlist):
    '''Given an author string or a list of author strings, return a list of
    cleaned, de-duplicated author names in order of first appearance.

    Strings that encode several authors (semicolon- or comma-separated)
    are split into individual names by auth_cleaner.
    '''
    # Accept any bare string as a one-element list. Checking "not a
    # list/tuple" instead of isinstance(authlist, str) also covers py2
    # unicode strings, which the old check missed -- a unicode author
    # string would have been iterated character by character.
    if not isinstance(authlist, (list, tuple)):
        authlist = [authlist]
    cleaned = []
    for auth in authlist:
        for cleaned_auth in auth_cleaner(auth):
            if cleaned_auth not in cleaned:
                cleaned.append(cleaned_auth)
    return cleaned

# Match a comma acting as a list delimiter, but not the one in ", Jr"
comma_list_delim = re.compile(r',(?! *Jr[\., ])')
# Collapse any run of whitespace to a single space
spaces = re.compile(r'\s+')

def auth_cleaner(auth):
    '''Given a single author string, split it into individual author names.

    Splits on semicolons when any are present, otherwise on list-delimiting
    commas; each name is stripped and internal whitespace runs are collapsed.
    '''
    cleaned = []
    if ';' in auth:
        authlist = auth.split(';')
    else:
        authlist = comma_list_delim.split(auth)
    for auth in authlist:
        cleaned.append(spaces.sub(' ', auth.strip()))
    return cleaned