Merge branch 'master' into production
commit db70284b7d
@@ -735,7 +735,7 @@ class LookupFailure(Exception):
 
 IDTABLE = [('librarything', 'ltwk'), ('goodreads', 'gdrd'), ('openlibrary', 'olwk'),
            ('gutenberg', 'gtbg'), ('isbn', 'isbn'), ('oclc', 'oclc'),
-           ('edition_id', 'edid'), ('googlebooks', 'goog'), ('doi', 'doi'), ('http','http'),
+           ('googlebooks', 'goog'), ('doi', 'doi'), ('http','http'), ('edition_id', 'edid'),
           ]
 
 def load_from_yaml(yaml_url, test_mode=False):
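The reorder puts ('edition_id', 'edid') at the end of IDTABLE. A minimal sketch of why order matters, assuming the loader walks the table in sequence (the helper below is illustrative, not the project's actual loader code):

# Illustrative only: each (name, id_code) pair is tried in table order,
# so moving ('edition_id', 'edid') to the end defers edid handling
# until every other identifier type has been seen.
def translate_identifiers(metadata_ids, idtable):
    new_ids = []
    for name, id_code in idtable:
        value = metadata_ids.get(name)
        if value:
            new_ids.append((name, id_code, value))
    return new_ids

translate_identifiers({'isbn': '9780000000002', 'edition_id': '42'}, IDTABLE)
# -> [('isbn', 'isbn', '9780000000002'), ('edition_id', 'edid', '42')]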
@@ -836,7 +836,6 @@ class BasePandataLoader(object):
                         # only need to create edid if there is no edition id for the edition
                         new_ids.append((identifier, id_code, value))
 
-
         if not work:
             work = models.Work.objects.create(title=metadata.title, language=metadata.language)
         if not edition:
@@ -101,12 +101,19 @@ class BaseScraper(object):
         dd = dt.find_next_sibling('dd') if dt else None
         return dd.text if dd else None
 
-    def get_itemprop(self, name):
+    def get_itemprop(self, name, **attrs):
         value_list = []
+        list_mode = attrs.pop('list_mode', 'list')
         attrs = {'itemprop': name}
         props = self.doc.find_all(attrs=attrs)
         for el in props:
-            value_list.append(el.text)
+            if list_mode == 'one_item':
+                return el.text if el.text else el.get('content')
+            else:
+                if el.text:
+                    value_list.append(el.text)
+                elif el.has_key('content'):
+                    value_list.append(el['content'])
         return value_list
 
     def setup(self):
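A usage sketch of the new keyword (the scraper variable is hypothetical; the list_mode='one_item' call mirrors the get_pubdate() change in the next hunk):

# With list_mode='one_item', the first match comes back as a scalar,
# falling back to the element's 'content' attribute when it has no text;
# the default mode still returns every match as a list.
pubdate = scraper.get_itemprop('datePublished', list_mode='one_item')
authors = scraper.get_itemprop('author')  # plain list, as before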
@@ -217,7 +224,9 @@ class BaseScraper(object):
         self.set('publisher', value)
 
     def get_pubdate(self):
-        value = self.check_metas(['citation_publication_date', 'DC.Date.issued', 'datePublished'])
+        value = self.get_itemprop('datePublished', list_mode='one_item')
+        if not value:
+            value = self.check_metas(['citation_publication_date', 'DC.Date.issued', 'datePublished'])
         if value:
             self.set('publication_date', value)
 
@@ -248,7 +257,7 @@ class BaseScraper(object):
         block = block if block else self.doc
         img = block.find_all('img', src=CONTAINS_COVER)
         if img:
-            cover_uri = img[0].get('src', None)
+            image_url = img[0].get('src', None)
             if image_url:
                 if not image_url.startswith('http'):
                     image_url = urljoin(self.base, image_url)
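The rename lines the assignment up with the image_url checks that follow it, making the relative-URL fallback reachable. For reference, urljoin resolves a relative cover path against the scraper's base URL like this (values illustrative; Python 3 import shown, the project's Python 2 equivalent lives in urlparse):

from urllib.parse import urljoin  # Python 2: from urlparse import urljoin

# A relative src is resolved against the page's base URL.
urljoin('https://example.com/book/', 'covers/front.jpg')
# -> 'https://example.com/book/covers/front.jpg'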
@@ -311,8 +320,13 @@ class PressbooksScraper(BaseScraper):
 
     @classmethod
     def can_scrape(cls, url):
+        pb_sites = ['bookkernel.com','milnepublishing.geneseo.edu', 'pressbooks',
+                    'press.rebus.community','pb.unizin.org']
         ''' return True if the class can scrape the URL '''
-        return url.find('press.rebus.community') > 0 or url.find('pressbooks.com') > 0
+        for site in pb_sites:
+            if url.find(site) > 0:
+                return True
+        return False
 
 
 class HathitrustScraper(BaseScraper):
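A quick check of the broadened matcher (URLs illustrative): any URL containing one of the pb_sites fragments now qualifies, not just press.rebus.community and pressbooks.com.

PressbooksScraper.can_scrape('https://pb.unizin.org/some-book/')  # True
PressbooksScraper.can_scrape('https://example.com/catalog')       # False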
@@ -0,0 +1,22 @@
+from __future__ import print_function
+
+from django.core.management.base import BaseCommand
+from django.db import IntegrityError
+
+from regluit.core import models
+
+class Command(BaseCommand):
+    help = "detach http identifiers from editions, and reattach or delete orphaned editions"
+
+    def handle(self, **options):
+        for ident in models.Identifier.objects.filter(type='http', edition__isnull=False):
+            ident.edition = None
+            ident.save()
+        for edition in models.Edition.objects.filter(work__isnull=True):
+            for ident in edition.identifiers.all():
+                if ident.work:
+                    edition.work = ident.work
+                    edition.save()
+                    break
+            if not edition.work:
+                edition.delete()
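The new file's path and name are not shown in this view; assuming it lands under core/management/commands/ with a hypothetical name such as fix_orphan_editions.py, it could be invoked like this (the command name is derived from the file name in Django):

# Hypothetical invocation; 'fix_orphan_editions' stands in for the
# unshown file name, which determines the management command's name.
from django.core.management import call_command

call_command('fix_orphan_editions')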
@@ -42,7 +42,7 @@ OTHER_ID_CHOICES = (
     ('edid', 'pragmatic edition ID'),
 )
 
-WORK_IDENTIFIERS = ('doi','olwk','glue','ltwk')
+WORK_IDENTIFIERS = ('doi','olwk','glue','ltwk', 'http')
 
 ID_CHOICES_MAP = dict(ID_CHOICES)
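Adding 'http' to WORK_IDENTIFIERS classifies web-page identifiers as work-level rather than edition-level, consistent with the management command above that detaches existing http identifiers from editions. A minimal sketch of the distinction (illustrative helper, not the project's actual model code):

# Illustrative only: identifier types in WORK_IDENTIFIERS describe a
# work as a whole; everything else pins down a specific edition.
def attaches_to_work(id_type):
    return id_type in WORK_IDENTIFIERS

attaches_to_work('http')  # True after this change
attaches_to_work('isbn')  # False: an ISBN names a specific edition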