From a09f3907b32979612047b11b54223eef6414f8ad Mon Sep 17 00:00:00 2001
From: eric
Date: Mon, 20 Nov 2017 18:05:07 -0500
Subject: [PATCH 1/3] add pressbooks sites, improve pubdata scraper

---
Review notes:
  * can_scrape: the site list is now assigned after the ''' ... ''' string so
    that the string is the method's docstring again.
  * get_itemprop: Tag.has_key() is deprecated in BeautifulSoup 4; use
    Tag.has_attr() instead.

 core/loaders/scrape.py | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/core/loaders/scrape.py b/core/loaders/scrape.py
index a55ca25a..d2f4f666 100644
--- a/core/loaders/scrape.py
+++ b/core/loaders/scrape.py
@@ -101,12 +101,19 @@ class BaseScraper(object):
         dd = dt.find_next_sibling('dd') if dt else None
         return dd.text if dd else None
 
-    def get_itemprop(self, name):
+    def get_itemprop(self, name, **attrs):
         value_list = []
+        list_mode = attrs.pop('list_mode', 'list')
         attrs = {'itemprop': name}
         props = self.doc.find_all(attrs=attrs)
         for el in props:
-            value_list.append(el.text)
+            if list_mode == 'one_item':
+                return el.text if el.text else el.get('content')
+            else:
+                if el.text:
+                    value_list.append(el.text)
+                elif el.has_attr('content'):
+                    value_list.append(el['content'])
         return value_list
 
     def setup(self):
@@ -217,7 +224,9 @@ class BaseScraper(object):
             self.set('publisher', value)
 
     def get_pubdate(self):
-        value = self.check_metas(['citation_publication_date', 'DC.Date.issued', 'datePublished'])
+        value = self.get_itemprop('datePublished', list_mode='one_item')
+        if not value:
+            value = self.check_metas(['citation_publication_date', 'DC.Date.issued', 'datePublished'])
         if value:
             self.set('publication_date', value)
 
@@ -311,8 +320,13 @@ class PressbooksScraper(BaseScraper):
 
     @classmethod
     def can_scrape(cls, url):
         ''' return True if the class can scrape the URL '''
-        return url.find('press.rebus.community') > 0 or url.find('pressbooks.com') > 0
+        pb_sites = ['bookkernel.com','milnepublishing.geneseo.edu', 'pressbooks',
+                    'press.rebus.community','pb.unizin.org']
+        for site in pb_sites:
+            if url.find(site) > 0:
+                return True
+        return False
 
 class HathitrustScraper(BaseScraper):
 

From 28fa60ffba3df23252a3039363782ac912c88863 Mon Sep 17 00:00:00 2001
From: eric
Date: Tue, 21 Nov 2017 11:10:46 -0500
Subject: [PATCH 2/3] fix cover finding

---
 core/loaders/scrape.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/core/loaders/scrape.py b/core/loaders/scrape.py
index d2f4f666..09671bf0 100644
--- a/core/loaders/scrape.py
+++ b/core/loaders/scrape.py
@@ -257,7 +257,7 @@ class BaseScraper(object):
             block = block if block else self.doc
             img = block.find_all('img', src=CONTAINS_COVER)
             if img:
-                cover_uri = img[0].get('src', None)
+                image_url = img[0].get('src', None)
         if image_url:
             if not image_url.startswith('http'):
                 image_url = urljoin(self.base, image_url)

From af4cac5cf87ec4e099536dbe6770886d7063d715 Mon Sep 17 00:00:00 2001
From: eric
Date: Tue, 21 Nov 2017 15:47:02 -0500
Subject: [PATCH 3/3] http should be a work id

---
Review notes:
  * delete_dangling_editions: `edition.work = work` referenced an undefined
    name (NameError at runtime); it now assigns `ident.work`, the work that
    the surrounding `if ident.work:` test just found.
  * delete_dangling_editions: the `help` text had been copied from another
    command; it now describes what handle() does.
  * delete_dangling_editions.py now ends with a newline.

 core/bookloader.py                            |  3 +--
 .../commands/delete_dangling_editions.py      | 22 +++++++++++++++++++
 core/parameters.py                            |  2 +-
 3 files changed, 24 insertions(+), 3 deletions(-)
 create mode 100644 core/management/commands/delete_dangling_editions.py

diff --git a/core/bookloader.py b/core/bookloader.py
index c9dc7bd3..af494392 100755
--- a/core/bookloader.py
+++ b/core/bookloader.py
@@ -735,7 +735,7 @@ class LookupFailure(Exception):
 
 IDTABLE = [('librarything', 'ltwk'), ('goodreads', 'gdrd'), ('openlibrary', 'olwk'),
     ('gutenberg', 'gtbg'), ('isbn', 'isbn'), ('oclc', 'oclc'),
-    ('edition_id', 'edid'), ('googlebooks', 'goog'), ('doi', 'doi'), ('http','http'),
+    ('googlebooks', 'goog'), ('doi', 'doi'), ('http','http'), ('edition_id', 'edid'),
 ]
 
 def load_from_yaml(yaml_url, test_mode=False):
@@ -836,7 +836,6 @@ class BasePandataLoader(object):
                     # only need to create edid if there is no edition id for the edition
                     new_ids.append((identifier, id_code, value))
 
-
         if not work:
             work = models.Work.objects.create(title=metadata.title, language=metadata.language)
         if not edition:
diff --git a/core/management/commands/delete_dangling_editions.py b/core/management/commands/delete_dangling_editions.py
new file mode 100644
index 00000000..2c69c341
--- /dev/null
+++ b/core/management/commands/delete_dangling_editions.py
@@ -0,0 +1,22 @@
+from __future__ import print_function
+
+from django.core.management.base import BaseCommand
+from django.db import IntegrityError
+
+from regluit.core import models
+
+class Command(BaseCommand):
+    help = "detach http identifiers from editions, then reattach or delete editions that have no work"
+
+    def handle(self, **options):
+        for ident in models.Identifier.objects.filter(type='http', edition__isnull=False):
+            ident.edition = None
+            ident.save()
+        for edition in models.Edition.objects.filter(work__isnull=True):
+            for ident in edition.identifiers.all():
+                if ident.work:
+                    edition.work = ident.work
+                    edition.save()
+                    break
+            if not edition.work:
+                edition.delete()
diff --git a/core/parameters.py b/core/parameters.py
index 29141e3a..4f796e53 100644
--- a/core/parameters.py
+++ b/core/parameters.py
@@ -42,7 +42,7 @@ OTHER_ID_CHOICES = (
     ('edid', 'pragmatic edition ID'),
 )
 
-WORK_IDENTIFIERS = ('doi','olwk','glue','ltwk')
+WORK_IDENTIFIERS = ('doi','olwk','glue','ltwk', 'http')
 
 ID_CHOICES_MAP = dict(ID_CHOICES)
 