commit d5cf83bc89
@@ -38,7 +38,7 @@ from . import cc
 from . import models
 from .parameters import WORK_IDENTIFIERS
 from .validation import identifier_cleaner
-from .loaders.scrape import BaseScraper, scrape_sitemap
+from .loaders.scrape import get_scraper, scrape_sitemap
 
 logger = logging.getLogger(__name__)
 request_log = logging.getLogger("requests")
@@ -1023,7 +1023,7 @@ def ebooks_in_github_release(repo_owner, repo_name, tag, token=None):
 
 def add_by_webpage(url, work=None, user=None):
     edition = None
-    scraper = BaseScraper(url)
+    scraper = get_scraper(url)
     loader = BasePandataLoader(url)
     pandata = Pandata()
     pandata.metadata = scraper.metadata
@@ -1035,7 +1035,6 @@ def add_by_webpage(url, work=None, user=None):
 
 def add_by_sitemap(url, maxnum=None):
     editions = []
-    scraper = BaseScraper(url)
     for bookdata in scrape_sitemap(url, maxnum=maxnum):
         edition = work = None
         loader = BasePandataLoader(bookdata.base)
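The three hunks above replace direct BaseScraper construction with the get_scraper() factory introduced later in this commit, and drop the stray scraper that add_by_sitemap built from the sitemap URL itself. A minimal sketch of the resulting call path; the module path and URLs are assumptions, not shown in the diff:

from regluit.core import bookloader   # assumed location of add_by_webpage / add_by_sitemap

# add_by_webpage now lets get_scraper() choose the scraper class for the page
edition = bookloader.add_by_webpage('https://press.rebus.community/example-book/')

# add_by_sitemap builds one scraper per <loc> entry via scrape_sitemap()
editions = bookloader.add_by_sitemap('https://press.rebus.community/sitemap.xml', maxnum=5)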
@@ -7,14 +7,14 @@ from django.conf import settings
 from urlparse import urljoin
 
 from regluit.core import models
-from regluit.core.validation import identifier_cleaner
+from regluit.core.validation import identifier_cleaner, authlist_cleaner
 
 logger = logging.getLogger(__name__)
 
 CONTAINS_COVER = re.compile('cover')
 CONTAINS_CC = re.compile('creativecommons.org')
 
 class BaseScraper(object):
     '''
     designed to make at least a decent gues for webpages that embed metadata
     '''
@@ -47,6 +47,10 @@ class BaseScraper(object):
             logger.error(e)
             self.metadata = {}
         self.metadata['identifiers'] = self.identifiers
 
+    #
+    # utilities
+    #
+
     def set(self, name, value):
         self.metadata[name] = value
@@ -65,6 +69,12 @@ class BaseScraper(object):
             attrs['name'] = meta_name
 
             metas = self.doc.find_all('meta', attrs=attrs)
+            if len(metas) == 0:
+                # some sites put schema.org metadata in metas
+                del(attrs['name'])
+                attrs['itemprop'] = meta_name
+                metas = self.doc.find_all('meta', attrs=attrs)
+                del(attrs['itemprop'])
             for meta in metas:
                 el_value = meta.get('content', '').strip()
                 if list_mode == 'longest':
@@ -78,6 +88,16 @@ class BaseScraper(object):
                     if value:
                         return value
         return value
 
+    def get_dt_dd(self, name):
+        ''' get the content of <dd> after a <dt> containing name'''
+        dt = self.doc.find('dt', string=re.compile(name))
+        dd = dt.find_next_sibling('dd') if dt else None
+        return dd.text if dd else None
+
+    #
+    # getters
+    #
+
     def get_genre(self):
         value = self.check_metas(['DC.Type', 'dc.type', 'og:type'])
@@ -91,7 +111,7 @@ class BaseScraper(object):
             self.set('title', value)
 
     def get_language(self):
-        value = self.check_metas(['DC.Language', 'dc.language', 'language'])
+        value = self.check_metas(['DC.Language', 'dc.language', 'language', 'inLanguage'])
         self.set('language', value)
 
     def get_description(self):
@@ -103,15 +123,8 @@ class BaseScraper(object):
         ])
         self.set('description', value)
 
-    def get_identifiers(self):
-        value = self.check_metas(['DC.Identifier.URI'])
-        value = identifier_cleaner('http')(value)
-        if value:
-            self.identifiers['http'] = value
-        value = self.check_metas(['DC.Identifier.DOI', 'citation_doi'])
-        value = identifier_cleaner('doi')(value)
-        if value:
-            self.identifiers['doi'] = value
+    def get_isbns(self):
+        '''return a dict of edition keys and ISBNs'''
         isbns = {}
         label_map = {'epub': 'EPUB', 'mobi': 'Mobi',
                      'paper': 'Paperback', 'pdf':'PDF', 'hard':'Hardback'}
@@ -122,7 +135,22 @@ class BaseScraper(object):
             if value:
                 isbns[isbn_key] = value
                 self.identifiers[isbn_key] = value
+        return isbns
+
+    def get_identifiers(self):
+        value = self.check_metas(['DC.Identifier.URI'])
+        if not value:
+            value = self.doc.select_one('link[rel=canonical]')
+            value = value['href'] if value else None
+        value = identifier_cleaner('http')(value)
+        if value:
+            self.identifiers['http'] = value
+        value = self.check_metas(['DC.Identifier.DOI', 'citation_doi'])
+        value = identifier_cleaner('doi')(value)
+        if value:
+            self.identifiers['doi'] = value
 
+        isbns = self.get_isbns()
         ed_list = []
         if len(isbns):
             #need to create edition list
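The canonical-link fallback added to get_identifiers() above can be exercised on its own; a standalone sketch with made-up markup (not part of the commit):

from bs4 import BeautifulSoup

doc = BeautifulSoup(
    '<head><link rel="canonical" href="https://example.org/book/1"/></head>', 'lxml')
value = doc.select_one('link[rel=canonical]')
print(value['href'] if value else None)   # https://example.org/book/1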
@@ -156,7 +184,7 @@ class BaseScraper(object):
             self.set('publisher', value)
 
     def get_pubdate(self):
-        value = self.check_metas(['citation_publication_date', 'DC.Date.issued'])
+        value = self.check_metas(['citation_publication_date', 'DC.Date.issued', 'datePublished'])
         if value:
             self.set('publication_date', value)
 
@@ -164,21 +192,22 @@ class BaseScraper(object):
         value_list = self.check_metas([
             'DC.Creator.PersonalName',
             'citation_author',
+            'author',
         ], list_mode='list')
         if not value_list:
             return
+        creator_list = []
+        value_list = authlist_cleaner(value_list)
         if len(value_list) == 1:
-            creator = {'author': {'agent_name': value_list[0]}}
-        else:
-            creator_list = []
-            for auth in value_list:
-                creator_list.append({'agent_name': auth})
-            creator = {'authors': creator_list }
+            self.set('creator', {'author': {'agent_name': auth.strip()}})
+            return
+        for auth in value_list:
+            creator_list.append({'agent_name': auth.strip()})
+        self.set('creator', {'authors': creator_list })
 
-        self.set('creator', creator)
 
     def get_cover(self):
-        image_url = self.check_metas(['og.image'])
+        image_url = self.check_metas(['og.image', 'image'])
         if not image_url:
             block = self.doc.find(class_=CONTAINS_COVER)
             block = block if block else self.doc
@@ -203,12 +232,65 @@ class BaseScraper(object):
         for link in links:
             self.set('rights_url', link['href'])
 
+    @classmethod
+    def can_scrape(cls, url):
+        ''' return True if the class can scrape the URL '''
+        return True
+
+class PressbooksScraper(BaseScraper):
+    def get_downloads(self):
+        for dl_type in ['epub', 'mobi', 'pdf']:
+            download_el = self.doc.select_one('.{}'.format(dl_type))
+            if download_el and download_el.find_parent():
+                value = download_el.find_parent().get('href')
+                if value:
+                    self.set('download_url_{}'.format(dl_type), value)
+
+    def get_publisher(self):
+        value = self.get_dt_dd('Publisher')
+        if not value:
+            value = self.doc.select_one('.cie-name')
+            value = value.text if value else None
+        if value:
+            self.set('publisher', value)
+        else:
+            super(PressbooksScraper, self).get_publisher()
+
+    def get_title(self):
+        value = self.doc.select_one('.entry-title a[title]')
+        value = value['title'] if value else None
+        if value:
+            self.set('title', value)
+        else:
+            super(PressbooksScraper, self).get_title()
+
+    def get_isbns(self):
+        '''add isbn identifiers and return a dict of edition keys and ISBNs'''
+        isbns = {}
+        for (key, label) in [('electronic', 'Ebook ISBN'), ('paper', 'Print ISBN')]:
+            isbn = identifier_cleaner('isbn')(self.get_dt_dd(label))
+            if isbn:
+                self.identifiers['isbn_{}'.format(key)] = isbn
+                isbns[key] = isbn
+        return isbns
+
+    @classmethod
+    def can_scrape(cls, url):
+        ''' return True if the class can scrape the URL '''
+        return url.find('press.rebus.community') > 0 or url.find('pressbooks.com') > 0
+
+def get_scraper(url):
+    scrapers = [PressbooksScraper, BaseScraper]
+    for scraper in scrapers:
+        if scraper.can_scrape(url):
+            return scraper(url)
+
 def scrape_sitemap(url, maxnum=None):
     try:
         response = requests.get(url, headers={"User-Agent": settings.USER_AGENT})
         doc = BeautifulSoup(response.content, 'lxml')
         for page in doc.find_all('loc')[0:maxnum]:
-            scraper = BaseScraper(page.text)
+            scraper = get_scraper(page.text)
             if scraper.metadata.get('genre', None) == 'book':
                 yield scraper
     except requests.exceptions.RequestException as e:
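A note on the dispatch added above, with an illustrative check (not part of the commit): get_scraper() tries PressbooksScraper first and falls back to BaseScraper, whose can_scrape() always returns True, so the order of the scrapers list matters. The URLs below are hypothetical; can_scrape() does no network access, so this runs standalone.

from regluit.core.loaders.scrape import PressbooksScraper, BaseScraper

PressbooksScraper.can_scrape('https://press.rebus.community/some-book/')   # True
PressbooksScraper.can_scrape('https://example.org/some-book/')             # False
BaseScraper.can_scrape('https://example.org/some-book/')                   # True, the fallback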
@@ -129,3 +129,31 @@ def valid_subject( subject_name ):
         return False
     return True
 
+def authlist_cleaner(authlist):
+    ''' given a author string or list of author strings, checks that the author string
+    is not a list of author names and that no author is repeated'''
+    if isinstance(authlist, str):
+        authlist = [authlist]
+    cleaned = []
+    for auth in authlist:
+        for cleaned_auth in auth_cleaner(auth):
+            if cleaned_auth not in cleaned:
+                cleaned.append(cleaned_auth)
+    return cleaned
+
+# Match comma but not ", Jr"
+comma_list_delim = re.compile(r',(?! *Jr[\., ])')
+spaces = re.compile(r'\s+')
+
+def auth_cleaner(auth):
+    ''' given a author string checks that the author string
+    is not a list of author names'''
+    cleaned = []
+
+    if ';' in auth:
+        authlist = auth.split(';')
+    else:
+        authlist = comma_list_delim.split(auth)
+    for auth in authlist:
+        cleaned.append(spaces.sub(' ', auth.strip()))
+    return cleaned
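Illustrative behaviour of the new cleaners; the input strings are made-up examples, not from the commit:

from regluit.core.validation import authlist_cleaner

authlist_cleaner('Jane Doe; John  Smith; Jane Doe')
# ['Jane Doe', 'John Smith']             -- semicolon split, whitespace collapsed, duplicates dropped
authlist_cleaner('Sammy Davis, Jr., Frank Sinatra')
# ['Sammy Davis, Jr.', 'Frank Sinatra']  -- comma split, but ", Jr" is not treated as a delimiter
authlist_cleaner(['One Author'])
# ['One Author']                         -- a list of names passes through unchanged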