Merge branch 'master' into production

pull/91/head
eric 2017-11-23 14:56:04 -05:00
commit db70284b7d
4 changed files with 43 additions and 8 deletions

View File

@@ -735,7 +735,7 @@ class LookupFailure(Exception):
# (identifier name, 4-character identifier code) pairs.
# Diff interleaving had duplicated the old ordering line alongside the new
# one; the merged list keeps each pair exactly once, with 'edition_id' last.
IDTABLE = [
    ('librarything', 'ltwk'), ('goodreads', 'gdrd'), ('openlibrary', 'olwk'),
    ('gutenberg', 'gtbg'), ('isbn', 'isbn'), ('oclc', 'oclc'),
    ('googlebooks', 'goog'), ('doi', 'doi'), ('http', 'http'),
    ('edition_id', 'edid'),
]
def load_from_yaml(yaml_url, test_mode=False):
@@ -836,7 +836,6 @@ class BasePandataLoader(object):
# only need to create edid if there is no edition id for the edition
new_ids.append((identifier, id_code, value))
if not work:
work = models.Work.objects.create(title=metadata.title, language=metadata.language)
if not edition:

View File

@@ -101,12 +101,19 @@ class BaseScraper(object):
dd = dt.find_next_sibling('dd') if dt else None
return dd.text if dd else None
def get_itemprop(self, name, **attrs):
    """Collect values from elements carrying itemprop=name microdata.

    Extra keyword arguments further filter the matched elements' attributes.
    With list_mode='one_item' the first match's text (or, if the text is
    empty, its 'content' attribute) is returned instead of a list.
    Returns a list of strings in the default 'list' mode.
    """
    value_list = []
    list_mode = attrs.pop('list_mode', 'list')
    # merge the itemprop filter into any caller-supplied attribute filters;
    # the garbled original rebound attrs wholesale, discarding the **attrs kwargs
    attrs['itemprop'] = name
    props = self.doc.find_all(attrs=attrs)
    for el in props:
        if list_mode == 'one_item':
            return el.text if el.text else el.get('content')
        if el.text:
            value_list.append(el.text)
        elif el.has_attr('content'):  # Tag.has_key() was removed in bs4
            value_list.append(el['content'])
    return value_list
def setup(self):
@@ -217,7 +224,9 @@ class BaseScraper(object):
self.set('publisher', value)
def get_pubdate(self):
    """Set 'publication_date' from microdata, falling back to meta tags.

    Prefers an itemprop="datePublished" value; only when that is absent does
    it consult the citation/DC/schema.org meta names. (The interleaved diff
    had kept the deleted pre-fallback check_metas line, whose result was
    immediately overwritten.)
    """
    value = self.get_itemprop('datePublished', list_mode='one_item')
    if not value:
        value = self.check_metas(['citation_publication_date', 'DC.Date.issued', 'datePublished'])
    if value:
        self.set('publication_date', value)
@@ -248,7 +257,7 @@ class BaseScraper(object):
block = block if block else self.doc
img = block.find_all('img', src=CONTAINS_COVER)
if img:
cover_uri = img[0].get('src', None)
image_url = img[0].get('src', None)
if image_url:
if not image_url.startswith('http'):
image_url = urljoin(self.base, image_url)
@@ -311,8 +320,13 @@ class PressbooksScraper(BaseScraper):
@classmethod
def can_scrape(cls, url):
    ''' return True if the class can scrape the URL '''
    # Known pressbooks-hosted domains, plus any url containing 'pressbooks'.
    # The interleaved diff had kept the deleted two-site `return` line,
    # which made this loop unreachable.
    pb_sites = ['bookkernel.com', 'milnepublishing.geneseo.edu', 'pressbooks',
                'press.rebus.community', 'pb.unizin.org']
    for site in pb_sites:
        # NOTE: find() > 0 also rejects a match at position 0 — preserved
        # from the original; urls are expected to start with a scheme.
        if url.find(site) > 0:
            return True
    return False
class HathitrustScraper(BaseScraper):

View File

@@ -0,0 +1,22 @@
from __future__ import print_function
from django.core.management.base import BaseCommand
from django.db import IntegrityError
from regluit.core import models
class Command(BaseCommand):
    help = "clean work and edition titles, work descriptions, and author and publisher names"

    def handle(self, **options):
        """Detach 'http' identifiers from editions, then re-home orphaned editions.

        1. 'http' identifiers are work-level identifiers (see WORK_IDENTIFIERS),
           so any still attached to an edition get their edition link cleared.
        2. Editions left with no work are re-attached to the work of the first
           of their identifiers that has one; editions with no such identifier
           are deleted.
        """
        # http identifiers belong on works, not editions
        for ident in models.Identifier.objects.filter(type='http', edition__isnull=False):
            ident.edition = None
            ident.save()
        for edition in models.Edition.objects.filter(work__isnull=True):
            for ident in edition.identifiers.all():
                if ident.work:
                    # BUG FIX: the original assigned the undefined name `work`
                    # (NameError); the intended source is ident.work.
                    edition.work = ident.work
                    edition.save()
                    break
            if not edition.work:
                edition.delete()

View File

@@ -42,7 +42,7 @@ OTHER_ID_CHOICES = (
('edid', 'pragmatic edition ID'),
)
# identifier types that attach to a Work rather than an Edition; the diff
# interleaving had left the superseded assignment (without 'http') in place
WORK_IDENTIFIERS = ('doi', 'olwk', 'glue', 'ltwk', 'http')
ID_CHOICES_MAP = dict(ID_CHOICES)