authlist cleaner, definition lists

pull/43/head
eric 2017-09-28 13:25:56 -04:00
parent 467ab8a425
commit fa4573a74d
2 changed files with 77 additions and 24 deletions

View File

@ -7,7 +7,7 @@ from django.conf import settings
from urlparse import urljoin
from regluit.core import models
from regluit.core.validation import identifier_cleaner
from regluit.core.validation import identifier_cleaner, authlist_cleaner
logger = logging.getLogger(__name__)
@ -47,6 +47,10 @@ class BaseScraper(object):
logger.error(e)
self.metadata = {}
self.metadata['identifiers'] = self.identifiers
#
# utilities
#
def set(self, name, value):
    '''Store a single scraped value in the metadata dict under key name.'''
    self.metadata[name] = value
@ -84,6 +88,16 @@ class BaseScraper(object):
if value:
return value
return value
def get_dt_dd(self, name):
''' get the content of <dd> after a <dt> containing name'''
dt = self.doc.find('dt', string=re.compile(name))
dd = dt.find_next_sibling('dd') if dt else None
return dd.text if dd else None
#
# getters
#
def get_genre(self):
value = self.check_metas(['DC.Type', 'dc.type', 'og:type'])
@ -109,6 +123,20 @@ class BaseScraper(object):
])
self.set('description', value)
def get_isbns(self):
    '''Return a dict of edition keys ('isbn_epub', 'isbn_mobi', ...) to
    cleaned ISBNs found in 'citation_isbn' metas.

    Each ISBN found is also recorded in self.identifiers under the same key.
    Formats with no valid ISBN are omitted from the result.
    '''
    isbns = {}
    label_map = {'epub': 'EPUB', 'mobi': 'Mobi',
                 'paper': 'Paperback', 'pdf': 'PDF', 'hard': 'Hardback'}
    # the same cleaner applies to every format, so build it once
    clean_isbn = identifier_cleaner('isbn')
    # iterate items() instead of keys() + re-indexing the dict
    for key, label in label_map.items():
        value = clean_isbn(self.check_metas(['citation_isbn'], type=label))
        if value:
            isbn_key = 'isbn_{}'.format(key)
            isbns[isbn_key] = value
            self.identifiers[isbn_key] = value
    return isbns
def get_identifiers(self):
value = self.check_metas(['DC.Identifier.URI'])
if not value:
@ -121,17 +149,8 @@ class BaseScraper(object):
value = identifier_cleaner('doi')(value)
if value:
self.identifiers['doi'] = value
isbns = {}
label_map = {'epub': 'EPUB', 'mobi': 'Mobi',
'paper': 'Paperback', 'pdf':'PDF', 'hard':'Hardback'}
for key in label_map.keys():
isbn_key = 'isbn_{}'.format(key)
value = self.check_metas(['citation_isbn'], type=label_map[key])
value = identifier_cleaner('isbn')(value)
if value:
isbns[isbn_key] = value
self.identifiers[isbn_key] = value
isbns = self.get_isbns()
ed_list = []
if len(isbns):
#need to create edition list
@ -178,18 +197,12 @@ class BaseScraper(object):
if not value_list:
return
creator_list = []
value_list = authlist_cleaner(value_list)
if len(value_list) == 1:
#first check if the value is really a list
auth = value_list[0]
authlist = auth.split(' and ')
if len(authlist) == 1:
self.set('creator', {'author': {'agent_name': auth}})
return
else:
value_list = authlist[0].split(',') + [authlist[1]]
self.set('creator', {'author': {'agent_name': auth.strip()}})
return
for auth in value_list:
creator_list.append({'agent_name': auth})
creator_list.append({'agent_name': auth.strip()})
self.set('creator', {'authors': creator_list })
@ -234,8 +247,10 @@ class PressbooksScraper(BaseScraper):
self.set('download_url_{}'.format(dl_type), value)
def get_publisher(self):
value = self.doc.select_one('.cie-name')
value = value.text if value else None
value = self.get_dt_dd('Publisher')
if not value:
value = self.doc.select_one('.cie-name')
value = value.text if value else None
if value:
self.set('publisher', value)
else:
@ -249,6 +264,16 @@ class PressbooksScraper(BaseScraper):
else:
super(PressbooksScraper, self).get_title()
def get_isbns(self):
    '''Add isbn identifiers and return a dict of edition keys and ISBNs.

    Pressbooks pages expose ISBNs as <dt>/<dd> definition-list entries
    rather than metas, so look them up via get_dt_dd.

    NOTE(review): this returns keys 'electronic'/'paper' while the base
    class get_isbns returns 'isbn_epub' etc. -- confirm callers accept
    both key shapes.
    '''
    isbns = {}
    sources = [('electronic', 'Ebook ISBN'), ('paper', 'Print ISBN')]
    for key, label in sources:
        cleaned = identifier_cleaner('isbn')(self.get_dt_dd(label))
        if not cleaned:
            continue
        self.identifiers['isbn_{}'.format(key)] = cleaned
        isbns[key] = cleaned
    return isbns
@classmethod
def can_scrape(cls, url):
''' return True if the class can scrape the URL '''

View File

@ -129,3 +129,31 @@ def valid_subject( subject_name ):
return False
return True
def authlist_cleaner(authlist):
    '''Given an author string or a list of author strings, return a list of
    cleaned, de-duplicated author names in order of first appearance.

    Strings that encode several authors (semicolon- or comma-separated)
    are split into individual names by auth_cleaner.
    '''
    # Accept any bare string as a one-element list. Checking "not a
    # list/tuple" instead of isinstance(authlist, str) also covers py2
    # unicode strings, which the old check missed -- a unicode author
    # string would have been iterated character by character.
    if not isinstance(authlist, (list, tuple)):
        authlist = [authlist]
    cleaned = []
    for auth in authlist:
        for cleaned_auth in auth_cleaner(auth):
            if cleaned_auth not in cleaned:
                cleaned.append(cleaned_auth)
    return cleaned

# Match a comma acting as a list delimiter, but not the one in ", Jr"
comma_list_delim = re.compile(r',(?! *Jr[\., ])')
# Collapse any run of whitespace to a single space
spaces = re.compile(r'\s+')

def auth_cleaner(auth):
    '''Given a single author string, split it into individual author names.

    Splits on semicolons when any are present, otherwise on list-delimiting
    commas; each name is stripped and internal whitespace runs are collapsed.
    '''
    cleaned = []
    if ';' in auth:
        authlist = auth.split(';')
    else:
        authlist = comma_list_delim.split(auth)
    for auth in authlist:
        cleaned.append(spaces.sub(' ', auth.strip()))
    return cleaned