Merge branch 'master' into production

pull/91/head
eric 2017-11-23 14:56:04 -05:00
commit db70284b7d
4 changed files with 43 additions and 8 deletions

View File

@@ -735,7 +735,7 @@ class LookupFailure(Exception):
# (identifier name, 4-character identifier code) pairs.
# Diff interleaving had duplicated the old ordering line alongside the new
# one; the merged list keeps each pair exactly once, with 'edition_id' last.
IDTABLE = [
    ('librarything', 'ltwk'), ('goodreads', 'gdrd'), ('openlibrary', 'olwk'),
    ('gutenberg', 'gtbg'), ('isbn', 'isbn'), ('oclc', 'oclc'),
    ('googlebooks', 'goog'), ('doi', 'doi'), ('http', 'http'),
    ('edition_id', 'edid'),
]
def load_from_yaml(yaml_url, test_mode=False):
@@ -836,7 +836,6 @@ class BasePandataLoader(object):
# only need to create edid if there is no edition id for the edition
new_ids.append((identifier, id_code, value))
if not work:
work = models.Work.objects.create(title=metadata.title, language=metadata.language)
if not edition:

View File

@@ -101,12 +101,19 @@ class BaseScraper(object):
dd = dt.find_next_sibling('dd') if dt else None
return dd.text if dd else None
def get_itemprop(self, name, **attrs):
    """Collect values from elements carrying itemprop=name microdata.

    Extra keyword arguments further filter the matched elements' attributes.
    With list_mode='one_item' the first match's text (or, if the text is
    empty, its 'content' attribute) is returned instead of a list.
    Returns a list of strings in the default 'list' mode.
    """
    value_list = []
    list_mode = attrs.pop('list_mode', 'list')
    # merge the itemprop filter into any caller-supplied attribute filters;
    # the garbled original rebound attrs wholesale, discarding the **attrs kwargs
    attrs['itemprop'] = name
    props = self.doc.find_all(attrs=attrs)
    for el in props:
        if list_mode == 'one_item':
            return el.text if el.text else el.get('content')
        if el.text:
            value_list.append(el.text)
        elif el.has_attr('content'):  # Tag.has_key() was removed in bs4
            value_list.append(el['content'])
    return value_list
def setup(self):
@@ -217,7 +224,9 @@ class BaseScraper(object):
self.set('publisher', value)
def get_pubdate(self):
    """Set 'publication_date' from microdata, falling back to meta tags.

    Prefers an itemprop="datePublished" value; only when that is absent does
    it consult the citation/DC/schema.org meta names. (The interleaved diff
    had kept the deleted pre-fallback check_metas line, whose result was
    immediately overwritten.)
    """
    value = self.get_itemprop('datePublished', list_mode='one_item')
    if not value:
        value = self.check_metas(['citation_publication_date', 'DC.Date.issued', 'datePublished'])
    if value:
        self.set('publication_date', value)
@@ -248,7 +257,7 @@ class BaseScraper(object):
block = block if block else self.doc
img = block.find_all('img', src=CONTAINS_COVER)
if img:
cover_uri = img[0].get('src', None)
image_url = img[0].get('src', None)
if image_url:
if not image_url.startswith('http'):
image_url = urljoin(self.base, image_url)
@@ -311,8 +320,13 @@ class PressbooksScraper(BaseScraper):
@classmethod
def can_scrape(cls, url):
    ''' return True if the class can scrape the URL '''
    # Known pressbooks-hosted domains, plus any url containing 'pressbooks'.
    # The interleaved diff had kept the deleted two-site `return` line,
    # which made this loop unreachable.
    pb_sites = ['bookkernel.com', 'milnepublishing.geneseo.edu', 'pressbooks',
                'press.rebus.community', 'pb.unizin.org']
    for site in pb_sites:
        # NOTE: find() > 0 also rejects a match at position 0 — preserved
        # from the original; urls are expected to start with a scheme.
        if url.find(site) > 0:
            return True
    return False
class HathitrustScraper(BaseScraper):

View File

@@ -0,0 +1,22 @@
from __future__ import print_function
from django.core.management.base import BaseCommand
from django.db import IntegrityError
from regluit.core import models
class Command(BaseCommand):
    help = "clean work and edition titles, work descriptions, and author and publisher names"

    def handle(self, **options):
        """Detach 'http' identifiers from editions, then re-home orphaned editions.

        1. 'http' identifiers are work-level identifiers (see WORK_IDENTIFIERS),
           so any still attached to an edition get their edition link cleared.
        2. Editions left with no work are re-attached to the work of the first
           of their identifiers that has one; editions with no such identifier
           are deleted.
        """
        # http identifiers belong on works, not editions
        for ident in models.Identifier.objects.filter(type='http', edition__isnull=False):
            ident.edition = None
            ident.save()
        for edition in models.Edition.objects.filter(work__isnull=True):
            for ident in edition.identifiers.all():
                if ident.work:
                    # BUG FIX: the original assigned the undefined name `work`
                    # (NameError); the intended source is ident.work.
                    edition.work = ident.work
                    edition.save()
                    break
            if not edition.work:
                edition.delete()

View File

@@ -42,7 +42,7 @@ OTHER_ID_CHOICES = (
('edid', 'pragmatic edition ID'),
)
# identifier types that attach to a Work rather than an Edition; the diff
# interleaving had left the superseded assignment (without 'http') in place
WORK_IDENTIFIERS = ('doi', 'olwk', 'glue', 'ltwk', 'http')
ID_CHOICES_MAP = dict(ID_CHOICES)