Merge pull request #710 from Gluejar/hathitrust-scrape

Hathitrust scraper
eshellman 2017-10-27 12:09:40 -04:00 committed by GitHub
commit e23a885f8a
3 changed files with 99 additions and 12 deletions

View File

@@ -5,6 +5,7 @@ from bs4 import BeautifulSoup
#from gitenberg.metadata.pandata import Pandata
from django.conf import settings
from urlparse import urljoin
from RISparser import read as readris
from regluit.core import models
from regluit.core.validation import identifier_cleaner, authlist_cleaner
@@ -13,6 +14,7 @@ logger = logging.getLogger(__name__)
CONTAINS_COVER = re.compile('cover')
CONTAINS_CC = re.compile('creativecommons.org')
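# matches WorldCat links such as worldcat.org/oclc/12345678 and captures the OCLC number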
CONTAINS_OCLCNUM = re.compile('worldcat.org/oclc/(\d+)')
class BaseScraper(object):
'''
@@ -26,7 +28,9 @@ class BaseScraper(object):
try:
response = requests.get(url, headers={"User-Agent": settings.USER_AGENT})
if response.status_code == 200:
self.base = response.url
self.doc = BeautifulSoup(response.content, 'lxml')
self.setup()
self.get_genre()
self.get_title()
self.get_language()
@@ -94,7 +98,18 @@ class BaseScraper(object):
dt = self.doc.find('dt', string=re.compile(name))
dd = dt.find_next_sibling('dd') if dt else None
return dd.text if dd else None
def get_itemprop(self, name):
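# collect the text of every element tagged with a matching schema.org itemprop attribute (e.g. itemprop="author")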
value_list = []
attrs = {'itemprop': name}
props = self.doc.find_all(attrs=attrs)
for el in props:
value_list.append(el.text)
return value_list
def setup(self):
# use this method to get auxiliary resources based on doc
pass
#
# getters
#
@@ -105,7 +120,7 @@ class BaseScraper(object):
self.set('genre', 'book')
def get_title(self):
-value = self.check_metas(['DC.Title', 'dc.title', 'citation_title', 'title'])
+value = self.check_metas(['DC.Title', 'dc.title', 'citation_title', 'og:title', 'title'])
if not value:
value = self.fetch_one_el_content('title')
self.set('title', value)
@@ -150,7 +165,17 @@ class BaseScraper(object):
if value:
self.identifiers['doi'] = value
-isbns = self.get_isbns()
#look for oclc numbers
links = self.doc.find_all(href=CONTAINS_OCLCNUM)
for link in links:
oclcmatch = CONTAINS_OCLCNUM.search(link['href'])
if oclcmatch:
value = identifier_cleaner('oclc')(oclcmatch.group(1))
if value:
self.identifiers['oclc'] = value
break
+isbns = self.get_isbns()
ed_list = []
if len(isbns):
#need to create edition list
@@ -195,7 +220,9 @@ class BaseScraper(object):
'author',
], list_mode='list')
-if not value_list:
-return
+if not value_list:
+value_list = self.get_itemprop('author')
+if not value_list:
+return
creator_list = []
value_list = authlist_cleaner(value_list)
if len(value_list) == 1:
@@ -207,16 +234,16 @@ class BaseScraper(object):
self.set('creator', {'authors': creator_list })
def get_cover(self):
-image_url = self.check_metas(['og.image', 'image'])
+image_url = self.check_metas(['og.image', 'image', 'twitter:image'])
if not image_url:
block = self.doc.find(class_=CONTAINS_COVER)
block = block if block else self.doc
img = block.find_all('img', src=CONTAINS_COVER)
if img:
cover_uri = img[0].get('src', None)
if cover_uri:
image_url = urljoin(self.base, cover_uri)
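# resolve relative cover URLs against self.base, the final URL of the fetched page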
if image_url:
if not image_url.startswith('http'):
image_url = urljoin(self.base, image_url)
self.set('covers', [{'image_url': image_url}])
def get_downloads(self):
@@ -279,8 +306,65 @@ class PressbooksScraper(BaseScraper):
''' return True if the class can scrape the URL '''
return url.find('press.rebus.community') > 0 or url.find('pressbooks.com') > 0
class HathitrustScraper(BaseScraper):
CATALOG = re.compile(r'catalog.hathitrust.org/Record/(\d+)')
def setup(self):
catalog_a = self.doc.find('a', href=self.CATALOG)
if catalog_a:
catalog_num = self.CATALOG.search(catalog_a['href']).group(1)
ris_url = 'https://catalog.hathitrust.org/Search/SearchExport?handpicked={}&method=ris'.format(catalog_num)
response = requests.get(ris_url, headers={"User-Agent": settings.USER_AGENT})
records = readris(response.text.splitlines()) if response.status_code == 200 else []
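# keep only the first record; a handpicked export should yield a single RIS record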
for record in records:
self.record = record
return
self.record = {}
def get_downloads(self):
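# the full-book PDF link ('#fullPdfLink') is only present for full-view items; its href is root-relative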
dl_a = self.doc.select_one('#fullPdfLink')
value = dl_a['href'] if dl_a else None
if value:
self.set(
'download_url_{}'.format('pdf'),
'https://babel.hathitrust.org{}'.format(value)
)
def get_isbns(self):
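# RISparser files the RIS 'SN' tag under 'issn', though for books it usually holds an ISBN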
isbn = self.record.get('issn', [])
value = identifier_cleaner('isbn')(isbn)
return {'print': value} if value else {}
def get_title(self):
self.set('title', self.record.get('title', ''))
def get_keywords(self):
self.set('subjects', self.record.get('keywords', []))
def get_publisher(self):
self.set('publisher', self.record.get('publisher', ''))
def get_pubdate(self):
self.set('publication_date', self.record.get('year', ''))
def get_description(self):
notes = self.record.get('notes', [])
self.set('description', '\r'.join(notes))
def get_genre(self):
self.set('genre', self.record.get('type_of_reference', '').lower())
@classmethod
def can_scrape(cls, url):
''' return True if the class can scrape the URL '''
return url.find('hathitrust.org') > 0 or url.find('hdl.handle.net/2027/') > 0
def get_scraper(url):
-scrapers = [PressbooksScraper, BaseScraper]
+scrapers = [PressbooksScraper, HathitrustScraper, BaseScraper]
for scraper in scrapers:
if scraper.can_scrape(url):
return scraper(url)
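
For reference, a minimal sketch of how the new scraper is dispatched (the item id in the URL is made up, and `metadata` assumes the dict that BaseScraper populates via self.set()):

# any hathitrust.org or hdl.handle.net/2027/ URL is routed to HathitrustScraper
scraper = get_scraper('https://babel.hathitrust.org/cgi/pt?id=uc1.b0000001')  # hypothetical id
print(scraper.metadata.get('title'))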

View File

@@ -48,7 +48,7 @@ def isbn_cleaner(value):
return value
isbn=ISBN(value)
if isbn.error:
-raise forms.ValidationError(isbn.error)
+raise ValidationError(isbn.error)
isbn.validate()
return isbn.to_string()
@@ -94,18 +94,18 @@ def test_file(the_file, fformat):
try:
book = EPUB(the_file.file)
except Exception as e:
-raise forms.ValidationError(_('Are you sure this is an EPUB file?: %s' % e) )
+raise ValidationError(_('Are you sure this is an EPUB file?: %s' % e) )
elif fformat == 'mobi':
try:
book = Mobi(the_file.file)
book.parse()
except Exception as e:
-raise forms.ValidationError(_('Are you sure this is a MOBI file?: %s' % e) )
+raise ValidationError(_('Are you sure this is a MOBI file?: %s' % e) )
elif fformat == 'pdf':
try:
doc = PdfFileReader(the_file.file)
except Exception, e:
-raise forms.ValidationError(_('%s is not a valid PDF file' % the_file.name) )
+raise ValidationError(_('%s is not a valid PDF file' % the_file.name) )
return True
def valid_xml_char_ordinal(c):
@@ -132,6 +132,7 @@ def valid_subject( subject_name ):
reverse_name_comma = re.compile(r',(?! *Jr[\., ])')
def unreverse_name(name):
name = name.strip('.')
if not reverse_name_comma.search(name):
return name
(last, rest) = name.split(',', 1)
@@ -157,12 +158,13 @@ comma_list_delim = re.compile(r',(?! *Jr[\., ])')
spaces = re.compile(r'\s+')
_and_ = re.compile(r',? (and|\&) ')
semicolon_list_delim = re.compile(r'[\;|\&]')
reversed_name = re.compile(r'(de |la |los |von |van )*\w+, \w+.?( \w+.?)?(, Jr\.?)?')
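# matches reversed names such as 'Austen, Jane', 'van Gogh, Vincent', or 'King, Martin Luther, Jr.'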
def auth_cleaner(auth):
''' given an author string, checks that the author string
is not a list of author names'''
cleaned = []
-if ';' in auth:
+if ';' in auth or reversed_name.match(auth):
authlist = semicolon_list_delim.split(auth)
authlist = [unreverse_name(name) for name in authlist]
else:
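
A quick sketch of the new branch's intended behavior (inputs hypothetical; outputs assume the rest of auth_cleaner, outside this hunk, normalizes whitespace as before):

auth_cleaner('Austen, Jane; Eliot, George')  # -> ['Jane Austen', 'George Eliot']
auth_cleaner('Austen, Jane')  # -> ['Jane Austen'], now caught by the reversed_name branch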

View File

@@ -103,3 +103,4 @@ pycparser==2.14
setuptools==25.0.0
urllib3==1.16
beautifulsoup4==4.6.0
RISparser==0.4.2