commit fdb0aabe36

@@ -8,9 +8,10 @@ from gitenberg.metadata.pandata import Pandata
 from regluit.core.bookloader import add_from_bookdatas, BasePandataLoader
 from .scrape import PressbooksScraper, HathitrustScraper, BaseScraper
 from .springer import SpringerScraper
+from .ubiquity import UbiquityScraper
 
 def get_scraper(url):
-    scrapers = [PressbooksScraper, HathitrustScraper, SpringerScraper, BaseScraper]
+    scrapers = [PressbooksScraper, HathitrustScraper, SpringerScraper, UbiquityScraper, BaseScraper]
     for scraper in scrapers:
         if scraper.can_scrape(url):
             return scraper(url)
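
The list order matters: BaseScraper's default can_scrape_strings of [''] matches
every URL, so it must stay last as the fallback. A minimal usage sketch, assuming
this module is importable as regluit.core.loaders (the URL is only an example):

    from regluit.core.loaders import get_scraper

    scraper = get_scraper('https://press.rebus.community/somebook/')
    # PressbooksScraper is the first class in the list whose can_scrape()
    # returns True, via its 'press.rebus.community' can_scrape_hosts entry
    print(type(scraper).__name__)  # -> 'PressbooksScraper'
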
@@ -1,5 +1,6 @@
 import re
 import logging
+from urlparse import urlparse
 import requests
 from bs4 import BeautifulSoup
 #from gitenberg.metadata.pandata import Pandata
@@ -18,8 +19,27 @@ CONTAINS_OCLCNUM = re.compile('worldcat.org/oclc/(\d+)')
 
 class BaseScraper(object):
     '''
-    designed to make at least a decent gues for webpages that embed metadata
+    designed to make at least a decent guess for webpages that embed metadata
     '''
+    can_scrape_hosts = False
+    can_scrape_strings = ['']  # should always return true
+    @classmethod
+    def can_scrape(cls, url):
+        ''' return True if the class can scrape the URL '''
+        if not (cls.can_scrape_hosts or cls.can_scrape_strings):
+            return True
+        if cls.can_scrape_hosts:
+            urlhost = urlparse(url).hostname
+            if urlhost:
+                for host in cls.can_scrape_hosts:
+                    if urlhost.endswith(host):
+                        return True
+        if cls.can_scrape_strings:
+            for pass_str in cls.can_scrape_strings:
+                if url.find(pass_str) >= 0:
+                    return True
+        return False
+
     def __init__(self, url):
         self.metadata = {}
         self.identifiers = {'http': url}
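
To make the two matching rules concrete, a small sketch (not part of the
commit; ExampleScraper and the URLs are invented):

    class ExampleScraper(BaseScraper):
        can_scrape_hosts = ['example.org']    # suffix match on the parsed hostname
        can_scrape_strings = ['/openbooks/']  # substring match anywhere in the URL

    ExampleScraper.can_scrape('https://press.example.org/title/1')    # True (host suffix)
    ExampleScraper.can_scrape('https://other.net/openbooks/title/1')  # True (substring)
    ExampleScraper.can_scrape('https://other.net/title/1')            # False
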
@@ -286,12 +306,12 @@ class BaseScraper(object):
         for link in links:
             self.set('rights_url', link['href'])
 
-    @classmethod
-    def can_scrape(cls, url):
-        ''' return True if the class can scrape the URL '''
-        return True
-
 class PressbooksScraper(BaseScraper):
+    can_scrape_hosts = ['bookkernel.com', 'milnepublishing.geneseo.edu',
+                        'press.rebus.community', 'pb.unizin.org']
+    can_scrape_strings = ['pressbooks']
 
     def get_downloads(self):
         for dl_type in ['epub', 'mobi', 'pdf']:
             download_el = self.doc.select_one('.{}'.format(dl_type))
@@ -328,19 +348,12 @@ class PressbooksScraper(BaseScraper):
             isbns[key] = isbn
         return isbns
 
-    @classmethod
-    def can_scrape(cls, url):
-        pb_sites = ['bookkernel.com','milnepublishing.geneseo.edu', 'pressbooks',
-            'press.rebus.community','pb.unizin.org']
-        ''' return True if the class can scrape the URL '''
-        for site in pb_sites:
-            if url.find(site) > 0:
-                return True
-        return False
-
 
 class HathitrustScraper(BaseScraper):
 
+    can_scrape_hosts = ['hathitrust.org']
+    can_scrape_strings = ['hdl.handle.net/2027/']
     CATALOG = re.compile(r'catalog.hathitrust.org/Record/(\d+)')
 
     def setup(self):
@@ -388,8 +401,3 @@ class HathitrustScraper(BaseScraper):
 
     def get_genre(self):
         self.set('genre', self.record.get('type_of_reference', '').lower())
-
-    @classmethod
-    def can_scrape(cls, url):
-        ''' return True if the class can scrape the URL '''
-        return url.find('hathitrust.org') > 0 or url.find('hdl.handle.net/2027/') > 0
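
The deleted overrides are behaviorally close to, but not identical with, the
new base-class check: they used url.find(site) > 0, while the shared version
uses >= 0 and, for hosts, a suffix match on the parsed hostname. For example
(the record URL is illustrative):

    # old Hathitrust override: substring test anywhere past position 0
    'https://catalog.hathitrust.org/Record/1'.find('hathitrust.org') > 0  # True
    # new host check: endswith() on urlparse(url).hostname
    urlparse('https://catalog.hathitrust.org/Record/1').hostname.endswith('hathitrust.org')  # True
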
@@ -1,8 +1,9 @@
 import re
-import requests
-
-from bs4 import BeautifulSoup
 from urlparse import urljoin
+
+import requests
+from bs4 import BeautifulSoup
 
 from django.conf import settings
 
+from regluit.core.validation import identifier_cleaner
@@ -14,11 +15,12 @@ MENTIONS_CC = re.compile(r'CC BY(-NC)?(-ND|-SA)?', flags=re.I)
 HAS_YEAR = re.compile(r'(19|20)\d\d')
 
 class SpringerScraper(BaseScraper):
+    can_scrape_strings = ['10.1007', '10.1057']
     def get_downloads(self):
         for dl_type in ['epub', 'mobi', 'pdf']:
             download_el = self.doc.find('a', title=re.compile(dl_type.upper()))
             if download_el:
-                value = download_el.get('href')
+                value = download_el.get('href')
                 if value:
                     value = urljoin(self.base, value)
                     self.set('download_url_{}'.format(dl_type), value)
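
'10.1007' and '10.1057' are DOI prefixes (Springer and Palgrave Macmillan,
respectively), so the substring check catches both link.springer.com URLs and
bare DOIs. For example (both URLs are invented):

    SpringerScraper.can_scrape('https://link.springer.com/book/10.1007/978-3-319-00000-0')  # True
    SpringerScraper.can_scrape('https://doi.org/10.1057/9781137000000')                     # True
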
@@ -31,7 +33,7 @@ class SpringerScraper(BaseScraper):
             text = div.get_text() if hasattr(div, 'get_text') else div.string
             if text:
                 text = text.replace(u'\xa0', u' ')
-                value = u'{}<p>{}</p>'.format(value, text)
+                value = u'{}<p>{}</p>'.format(value, text)
         self.set('description', value)
 
     def get_keywords(self):
@@ -42,7 +44,7 @@ class SpringerScraper(BaseScraper):
         if 'Open Access' in value:
             value.remove('Open Access')
         self.set('subjects', value)
-
+
     def get_identifiers(self):
         super(SpringerScraper, self).get_identifiers()
         el = self.doc.select_one('#doi-url')
@@ -64,7 +66,7 @@ class SpringerScraper(BaseScraper):
         if value:
             isbns['electronic'] = value
         return isbns
-
+
     def get_title(self):
         el = self.doc.select_one('#book-title')
         value = ''
@@ -75,7 +77,7 @@ class SpringerScraper(BaseScraper):
         self.set('title', value)
         if not value:
             super(SpringerScraper, self).get_title()
-
+
     def get_role(self):
         if self.doc.select_one('#editors'):
             return 'editor'
@@ -84,19 +86,19 @@ class SpringerScraper(BaseScraper):
     def get_author_list(self):
         for el in self.doc.select('.authors__name'):
             yield el.text.strip().replace(u'\xa0', u' ')
-
+
     def get_license(self):
         '''only looks for cc licenses'''
         links = self.doc.find_all(href=CONTAINS_CC)
         for link in links:
             self.set('rights_url', link['href'])
             return
-        mention = self.doc.find(string=MENTIONS_CC)
+        mention = self.doc.find(string=MENTIONS_CC)
         if mention:
             lic = MENTIONS_CC.search(mention).group(0)
             lic_url = 'https://creativecommons.org/licenses/{}/'.format(lic[3:].lower())
             self.set('rights_url', lic_url)
-
+
     def get_pubdate(self):
         pubinfo = self.doc.select_one('#copyright-info')
         if pubinfo:
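
In the fallback branch, lic[3:] drops the leading 'CC ' from the matched
license name before building the license URL. A worked example with invented
page text:

    m = MENTIONS_CC.search(u'Published under a CC BY-NC license.')
    lic = m.group(0)   # 'CC BY-NC'
    lic[3:].lower()    # 'by-nc'
    # rights_url becomes 'https://creativecommons.org/licenses/by-nc/'
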
@@ -107,12 +109,6 @@ class SpringerScraper(BaseScraper):
     def get_publisher(self):
         self.set('publisher', 'Springer')
 
-    @classmethod
-    def can_scrape(cls, url):
-        ''' return True if the class can scrape the URL '''
-        return url.find('10.1007') >= 0 or url.find('10.1057') >= 0
-
-
 search_url = 'https://link.springer.com/search/page/{}?facet-content-type=%22Book%22&package=openaccess'
-def load_springer(num_pages):
+def springer_open_books(num_pages):
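
The loader is also renamed from load_springer to springer_open_books. The
search_url template takes the result-page number; for instance:

    search_url.format(1)
    # -> 'https://link.springer.com/search/page/1?facet-content-type=%22Book%22&package=openaccess'
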
@@ -0,0 +1,31 @@
+import re
+from urlparse import urlparse
+
+from regluit.utils.lang import get_language_code
+from . import BaseScraper
+
+
+HAS_EDS = re.compile(r'\(eds?\.\)')
+UBIQUITY_HOSTS = ["ubiquitypress.com", "kriterium.se", "oa.finlit.fi", "humanities-map.net",
+    "oa.psupress.org", "larcommons.net", "uwestminsterpress.co.uk", "stockholmuniversitypress.se",
+    "luminosoa.org",
+]
+
+class UbiquityScraper(BaseScraper):
+    can_scrape_hosts = UBIQUITY_HOSTS
+    def get_role(self):
+        descs = self.doc.select('section.book-description')
+        for desc in descs:
+            if desc.find(string=HAS_EDS):
+                return 'editor'
+        return super(UbiquityScraper, self).get_role()
+
+    def get_language(self):
+        langlabel = self.doc.find(string='Language')
+        lang = langlabel.parent.parent.find_next_sibling()
+        lang = lang.get_text() if lang else ''
+        lang = get_language_code(lang) if lang else ''
+        if lang:
+            self.set('language', lang)
+        else:
+            super(UbiquityScraper, self).get_language()
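
(The super() calls originally read super(self, UbiquityScraper), which raises
TypeError; the class must come first.) get_language's
parent.parent.find_next_sibling() walk assumes the 'Language' label and its
value sit in adjacent wrapper elements. A toy illustration (the markup is
invented, not taken from Ubiquity's pages):

    from bs4 import BeautifulSoup
    html = '<dl><div><dt>Language</dt></div><div><dd>English</dd></div></dl>'
    doc = BeautifulSoup(html, 'html.parser')
    label = doc.find(string='Language')              # the string inside <dt>
    value = label.parent.parent.find_next_sibling()  # the sibling <div>
    value.get_text()                                 # -> 'English'
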
@@ -23,7 +23,7 @@ class Command(BaseCommand):
             ebf = ebook.get_archive_ebf()
             if ebf:
                 try:
-                    print 'making mobi for {}'.format(work.title)
+                    print u'making mobi for {}'.format(work.title)
                     if ebf.make_mobi():
                         print 'made mobi'
                         i = i + 1

|
@ -0,0 +1,6 @@
|
|||
from django.conf.global_settings import LANGUAGES
|
||||
|
||||
lang2code = dict([ (lang[1].lower(), lang[0]) for lang in LANGUAGES ])
|
||||
|
||||
def get_language_code(language):
|
||||
return lang2code.get(language.lower().strip(), '')
|
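
This maps Django's English-language names to language codes; a quick check
(assuming Django's standard LANGUAGES list):

    from regluit.utils.lang import get_language_code

    get_language_code(' English ')  # -> 'en'  (case- and whitespace-insensitive)
    get_language_code('Klingon')    # -> ''    (unknown names fall back to the empty default)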