Merge pull request #741 from Gluejar/improve_scrape

Improve scrape
eshellman 2017-12-27 12:21:19 -05:00 committed by GitHub
commit fdb0aabe36
6 changed files with 80 additions and 38 deletions

core/loaders/__init__.py

@ -8,9 +8,10 @@ from gitenberg.metadata.pandata import Pandata
from regluit.core.bookloader import add_from_bookdatas, BasePandataLoader
from .scrape import PressbooksScraper, HathitrustScraper, BaseScraper
from .springer import SpringerScraper
+from .ubiquity import UbiquityScraper
def get_scraper(url):
-scrapers = [PressbooksScraper, HathitrustScraper, SpringerScraper, BaseScraper]
+scrapers = [PressbooksScraper, HathitrustScraper, SpringerScraper, UbiquityScraper, BaseScraper]
for scraper in scrapers:
if scraper.can_scrape(url):
return scraper(url)
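
In effect (a sketch, not part of this diff, assuming get_scraper is importable from regluit.core.loaders): get_scraper tries each class in list order and returns an instance of the first one whose can_scrape() accepts the URL, so the catch-all BaseScraper must stay last.

# Sketch only; constructing a scraper may fetch and parse the page.
from regluit.core.loaders import get_scraper

scraper = get_scraper('https://press.rebus.community/example-book/')  # hypothetical URL
print type(scraper).__name__   # PressbooksScraper, matched via its can_scrape_hosts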

core/loaders/scrape.py

@ -1,5 +1,6 @@
import re
import logging
+from urlparse import urlparse
import requests
from bs4 import BeautifulSoup
#from gitenberg.metadata.pandata import Pandata
@ -18,8 +19,27 @@ CONTAINS_OCLCNUM = re.compile('worldcat.org/oclc/(\d+)')
class BaseScraper(object):
'''
-designed to make at least a decent gues for webpages that embed metadata
+designed to make at least a decent guess for webpages that embed metadata
'''
+can_scrape_hosts = False
+can_scrape_strings = [''] #should always return true
+@classmethod
+def can_scrape(cls, url):
+''' return True if the class can scrape the URL '''
+if not (cls.can_scrape_hosts or cls.can_scrape_strings):
+return True
+if cls.can_scrape_hosts:
+urlhost = urlparse(url).hostname
+if urlhost:
+for host in cls.can_scrape_hosts:
+if urlhost.endswith(host):
+return True
+if cls.can_scrape_strings:
+for pass_str in cls.can_scrape_strings:
+if url.find(pass_str) >= 0:
+return True
+return False
def __init__(self, url):
self.metadata = {}
self.identifiers = {'http': url}
@ -286,12 +306,12 @@ class BaseScraper(object):
for link in links:
self.set('rights_url', link['href'])
-@classmethod
-def can_scrape(cls, url):
-''' return True if the class can scrape the URL '''
-return True
class PressbooksScraper(BaseScraper):
+can_scrape_hosts = ['bookkernel.com', 'milnepublishing.geneseo.edu',
+'press.rebus.community', 'pb.unizin.org']
+can_scrape_strings = ['pressbooks']
def get_downloads(self):
for dl_type in ['epub', 'mobi', 'pdf']:
download_el = self.doc.select_one('.{}'.format(dl_type))
@ -328,19 +348,12 @@ class PressbooksScraper(BaseScraper):
isbns[key] = isbn
return isbns
-@classmethod
-def can_scrape(cls, url):
-pb_sites = ['bookkernel.com','milnepublishing.geneseo.edu', 'pressbooks',
-'press.rebus.community','pb.unizin.org']
-''' return True if the class can scrape the URL '''
-for site in pb_sites:
-if url.find(site) > 0:
-return True
-return False
class HathitrustScraper(BaseScraper):
+can_scrape_hosts = ['hathitrust.org']
+can_scrape_strings = ['hdl.handle.net/2027/']
CATALOG = re.compile(r'catalog.hathitrust.org/Record/(\d+)')
def setup(self):
@ -388,8 +401,3 @@ class HathitrustScraper(BaseScraper):
def get_genre(self):
self.set('genre', self.record.get('type_of_reference', '').lower())
-@classmethod
-def can_scrape(cls, url):
-''' return True if the class can scrape the URL '''
-return url.find('hathitrust.org') > 0 or url.find('hdl.handle.net/2027/') > 0
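
Net effect of this file's changes (illustrative sketch, not part of the diff): the per-class can_scrape() overrides are replaced by two class attributes that the new BaseScraper.can_scrape() checks, either as a suffix of the parsed hostname or as a plain substring of the URL; BaseScraper keeps can_scrape_strings = [''], which matches any URL.

# Sketch of the new matching rules, reusing values from this commit (Python 2).
from urlparse import urlparse

url = 'https://pb.unizin.org/somebook/'
print urlparse(url).hostname.endswith('pb.unizin.org')   # True: host-suffix match
print url.find('hdl.handle.net/2027/') >= 0              # False: substring test
print url.find('') >= 0                                  # True: BaseScraper's catch-all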

core/loaders/springer.py

@ -1,8 +1,9 @@
import re
-import requests
-from bs4 import BeautifulSoup
+from urlparse import urljoin
+import requests
+from bs4 import BeautifulSoup
from django.conf import settings
from regluit.core.validation import identifier_cleaner
@ -14,11 +15,12 @@ MENTIONS_CC = re.compile(r'CC BY(-NC)?(-ND|-SA)?', flags=re.I)
HAS_YEAR = re.compile(r'(19|20)\d\d')
class SpringerScraper(BaseScraper):
+can_scrape_strings =['10.1007', '10.1057']
def get_downloads(self):
for dl_type in ['epub', 'mobi', 'pdf']:
download_el = self.doc.find('a', title=re.compile(dl_type.upper()))
if download_el:
value = download_el.get('href')
if value:
value = urljoin(self.base, value)
self.set('download_url_{}'.format(dl_type), value)
@ -31,7 +33,7 @@ class SpringerScraper(BaseScraper):
text = div.get_text() if hasattr(div, 'get_text') else div.string
if text:
text = text.replace(u'\xa0', u' ')
value = u'{}<p>{}</p>'.format(value, text)
self.set('description', value)
def get_keywords(self):
@ -42,7 +44,7 @@ class SpringerScraper(BaseScraper):
if 'Open Access' in value:
value.remove('Open Access')
self.set('subjects', value)
def get_identifiers(self):
super(SpringerScraper, self).get_identifiers()
el = self.doc.select_one('#doi-url')
@ -64,7 +66,7 @@ class SpringerScraper(BaseScraper):
if value:
isbns['electronic'] = value
return isbns
def get_title(self):
el = self.doc.select_one('#book-title')
value = ''
@ -75,7 +77,7 @@ class SpringerScraper(BaseScraper):
self.set('title', value)
if not value:
super(SpringerScraper, self).get_title()
def get_role(self):
if self.doc.select_one('#editors'):
return 'editor'
@ -84,19 +86,19 @@ class SpringerScraper(BaseScraper):
def get_author_list(self):
for el in self.doc.select('.authors__name'):
yield el.text.strip().replace(u'\xa0', u' ')
def get_license(self):
'''only looks for cc licenses'''
links = self.doc.find_all(href=CONTAINS_CC)
for link in links:
self.set('rights_url', link['href'])
return
mention = self.doc.find(string=MENTIONS_CC)
if mention:
lic = MENTIONS_CC.search(mention).group(0)
lic_url = 'https://creativecommons.org/licenses/{}/'.format(lic[3:].lower())
self.set('rights_url', lic_url)
def get_pubdate(self):
pubinfo = self.doc.select_one('#copyright-info')
if pubinfo:
@ -107,12 +109,6 @@ class SpringerScraper(BaseScraper):
def get_publisher(self):
self.set('publisher', 'Springer')
-@classmethod
-def can_scrape(cls, url):
-''' return True if the class can scrape the URL '''
-return url.find('10.1007') >= 0 or url.find('10.1057') >= 0
search_url = 'https://link.springer.com/search/page/{}?facet-content-type=%22Book%22&package=openaccess'
-def load_springer(num_pages):
+def springer_open_books(num_pages):
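
For reference (sketch, not part of the diff): when no link carries a CC license URL, get_license falls back to MENTIONS_CC and builds the rights URL from the matched text.

import re
MENTIONS_CC = re.compile(r'CC BY(-NC)?(-ND|-SA)?', flags=re.I)

mention = 'Published under a CC BY-NC-ND license.'   # hypothetical page text
lic = MENTIONS_CC.search(mention).group(0)           # 'CC BY-NC-ND'
print 'https://creativecommons.org/licenses/{}/'.format(lic[3:].lower())
# prints https://creativecommons.org/licenses/by-nc-nd/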

core/loaders/ubiquity.py (new file, 31 lines)

@ -0,0 +1,31 @@
import re
from urlparse import urlparse
from regluit.utils.lang import get_language_code
from . import BaseScraper
HAS_EDS = re.compile(r'\(eds?\.\)')
UBIQUITY_HOSTS = ["ubiquitypress.com", "kriterium.se", "oa.finlit.fi", "humanities-map.net",
"oa.psupress.org", "larcommons.net", "uwestminsterpress.co.uk", "stockholmuniversitypress.se",
"luminosoa.org",
]
class UbiquityScraper(BaseScraper):
can_scrape_hosts = UBIQUITY_HOSTS
def get_role(self):
descs = self.doc.select('section.book-description')
for desc in descs:
if desc.find(string=HAS_EDS):
return 'editor'
return super(UbiquityScraper, self).get_role()
def get_language(self):
langlabel = self.doc.find(string='Language')
lang = langlabel.parent.parent.find_next_sibling()
lang = lang.get_text() if lang else ''
lang = get_language_code(lang) if lang else ''
if lang:
self.set('language', lang)
else:
super(UbiquityScraper, self).get_language()
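
The HAS_EDS pattern drives the editor heuristic above: an '(ed.)' or '(eds.)' marker anywhere in a book-description section flags the contributors as editors. A quick check with hypothetical strings:

import re
HAS_EDS = re.compile(r'\(eds?\.\)')

print bool(HAS_EDS.search(u'Jane Doe (ed.)'))     # True
print bool(HAS_EDS.search(u'Doe & Roe (eds.)'))   # True
print bool(HAS_EDS.search(u'Jane Doe'))           # False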

(management command for making mobi files; file name not shown)

@ -23,7 +23,7 @@ class Command(BaseCommand):
ebf = ebook.get_archive_ebf()
if ebf:
try:
-print 'making mobi for {}'.format(work.title)
+print u'making mobi for {}'.format(work.title)
if ebf.make_mobi():
print 'made mobi'
i = i + 1

utils/lang.py (new file, 6 lines)

@ -0,0 +1,6 @@
from django.conf.global_settings import LANGUAGES
lang2code = dict([ (lang[1].lower(), lang[0]) for lang in LANGUAGES ])
def get_language_code(language):
return lang2code.get(language.lower().strip(), '')
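
A quick check of the helper (sketch; the exact codes come from whatever LANGUAGES the installed Django version defines):

from regluit.utils.lang import get_language_code

print get_language_code('English')     # 'en'
print get_language_code(' FINNISH ')   # 'fi' -- input is lower-cased and stripped
print get_language_code('No Such')     # ''   -- unknown names map to empty string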