regluit/core/loaders/scrape.py

184 lines
6.5 KiB
Python

import re
import logging
import requests
from bs4 import BeautifulSoup
#from gitenberg.metadata.pandata import Pandata
from django.conf import settings
from urlparse import urljoin
from regluit.core import models
from regluit.core.validation import identifier_cleaner
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Matches any string containing "cover"; used when searching for cover images.
CONTAINS_COVER = re.compile('cover')

# Matches Creative Commons URLs. The dot is escaped so that only a literal
# "creativecommons.org" matches (an unescaped "." would match any character).
CONTAINS_CC = re.compile(r'creativecommons\.org')
class BaseScraper(object):
    '''Scrape book metadata from the HTML page at a URL.

    The constructor fetches the page and, on an HTTP 200 response, parses it
    and runs every ``get_*`` method. Results are collected in ``self.metadata``
    (a dict) and ``self.identifiers`` (a dict that always holds at least the
    ``'http'`` key); the identifiers dict is also stored under
    ``metadata['identifiers']``.
    '''

    def __init__(self, url):
        '''Fetch ``url`` and populate ``self.metadata``.

        Request failures (``requests.exceptions.RequestException``) are logged
        and leave ``self.metadata`` holding only the ``'identifiers'`` entry.
        '''
        self.metadata = {}
        self.identifiers = {'http': url}
        self.doc = None
        self.base = url
        try:
            response = requests.get(url, headers={"User-Agent": settings.USER_AGENT})
            if response.status_code == 200:
                self.doc = BeautifulSoup(response.content, 'lxml')
                self.get_title()
                self.get_language()
                self.get_description()
                self.get_identifiers()
                self.get_keywords()
                self.get_publisher()
                self.get_pubdate()
                self.get_authors()
                self.get_cover()
                self.get_downloads()
                self.get_license()
            # Fallbacks apply whether or not the page could be parsed.
            if not self.metadata.get('title', None):
                self.set('title', '!!! missing title !!!')
            if not self.metadata.get('language', None):
                self.set('language', 'en')
        except requests.exceptions.RequestException as e:
            logger.error(e)
            # Reset to an empty dict rather than None: the assignment below
            # subscripts self.metadata and would raise TypeError on None.
            self.metadata = {}
        self.metadata['identifiers'] = self.identifiers

    def set(self, name, value):
        '''Store ``value`` in the metadata dict under ``name``.'''
        self.metadata[name] = value

    def fetch_one_el_content(self, el_name):
        '''Return the text of the first ``el_name`` element, or '' if absent.'''
        data_el = self.doc.find(el_name)
        if data_el:
            return data_el.text
        return ''

    def check_metas(self, meta_list, **attrs):
        '''Look up the content of ``<meta>`` tags by name.

        ``meta_list`` is tried in order; the first name that yields a truthy
        result wins. Any extra keyword arguments are passed as additional
        attribute filters to ``find_all``, except ``list_mode``:

        * ``'longest'`` (default): return the longest stripped ``content``
          string among the matching tags.
        * ``'list'``: return a list of the stripped ``content`` strings of all
          matching tags.

        Returns '' (an empty string, in both modes) when nothing is found.
        '''
        value = ''
        list_mode = attrs.pop('list_mode', 'longest')
        for meta_name in meta_list:
            attrs['name'] = meta_name
            metas = self.doc.find_all('meta', attrs=attrs)
            for meta in metas:
                el_value = meta.get('content', '').strip()
                if list_mode == 'longest':
                    if len(el_value) > len(value):
                        value = el_value
                elif list_mode == 'list':
                    # value starts out as '' and becomes a list on first hit.
                    if value == '':
                        value = [el_value]
                    else:
                        value.append(el_value)
            if value:
                return value
        return value

    def get_title(self):
        '''Set 'title' from the title metas, falling back to ``<title>``.'''
        value = self.check_metas(['DC.Title', 'dc.title', 'citation_title', 'title'])
        if not value:
            value = self.fetch_one_el_content('title')
        self.set('title', value)

    def get_language(self):
        '''Set 'language' from the language metas (may be '').'''
        value = self.check_metas(['DC.Language', 'dc.language', 'language'])
        self.set('language', value)

    def get_description(self):
        '''Set 'description' from the description metas (may be '').'''
        value = self.check_metas(['DC.Description', 'dc.description', 'description'])
        self.set('description', value)

    def get_identifiers(self):
        '''Collect URI, DOI and ISBN identifiers and build 'edition_list'.

        Every raw value is passed through ``identifier_cleaner(<type>)``
        (defined in regluit.core.validation; presumably it validates or
        normalizes the value -- only truthy results are kept here).

        ISBNs are first looked up per format via ``citation_isbn`` metas that
        carry a ``type`` attribute; if none are found, all untyped
        ``citation_isbn`` metas are used instead.
        '''
        value = self.check_metas(['DC.Identifier.URI'])
        value = identifier_cleaner('http')(value)
        if value:
            self.identifiers['http'] = value

        value = self.check_metas(['DC.Identifier.DOI', 'citation_doi'])
        value = identifier_cleaner('doi')(value)
        if value:
            self.identifiers['doi'] = value

        isbns = {}
        label_map = {'epub': 'EPUB', 'mobi': 'Mobi',
                     'paper': 'Paperback', 'pdf': 'PDF', 'hard': 'Hardback'}
        for key, label in label_map.items():
            isbn_key = 'isbn_{}'.format(key)
            value = self.check_metas(['citation_isbn'], type=label)
            value = identifier_cleaner('isbn')(value)
            if value:
                isbns[isbn_key] = value
                self.identifiers[isbn_key] = value

        ed_list = []
        if isbns:
            # Format-specific ISBNs: one edition per format, noted by format.
            for key in isbns:
                isbn_type = key.split('_')[-1]
                ed_list.append({
                    'edition_note': isbn_type,
                    'edition_identifiers': {'isbn': isbns[key]}
                })
        else:
            # No typed ISBNs: fall back to every citation_isbn meta found.
            value = self.check_metas(['citation_isbn'], list_mode='list')
            if value:
                for isbn in value:
                    isbn = identifier_cleaner('isbn')(isbn)
                    if isbn:
                        ed_list.append({
                            '_edition': isbn,
                            'edition_identifiers': {'isbn': isbn}
                        })
        if ed_list:
            self.set('edition_list', ed_list)

    def get_keywords(self):
        '''Set 'subjects' by splitting the keywords meta on ',' and ';'.'''
        # Strip leading/trailing separators so the split yields no empty ends.
        value = self.check_metas(['keywords']).strip(',;')
        if value:
            self.set('subjects', re.split(' *[;,] *', value))

    def get_publisher(self):
        '''Set 'publisher' if a publisher meta is present.'''
        value = self.check_metas(['citation_publisher', 'DC.Source'])
        if value:
            self.set('publisher', value)

    def get_pubdate(self):
        '''Set 'publication_date' if a publication-date meta is present.'''
        value = self.check_metas(['citation_publication_date', 'DC.Date.issued'])
        if value:
            self.set('publication_date', value)

    def get_authors(self):
        '''Set 'creator' from the author metas.

        A single author is stored as ``{'author': {...}}``; several authors
        are stored as ``{'authors': [{...}, ...]}``. Nothing is set when no
        author meta is found.
        '''
        value_list = self.check_metas(
            ['DC.Creator.PersonalName', 'citation_author'], list_mode='list')
        if not value_list:
            return
        if len(value_list) == 1:
            creator = {'author': {'agent_name': value_list[0]}}
        else:
            creator = {'authors': [{'agent_name': auth} for auth in value_list]}
        self.set('creator', creator)

    def get_cover(self):
        '''Set 'covers' to the first ``<img>`` whose ``src`` contains "cover".

        The search is limited to the first element whose class contains
        "cover" when such an element exists, otherwise the whole document is
        searched. The image URL is resolved against the page URL.
        '''
        block = self.doc.find(class_=CONTAINS_COVER)
        block = block if block else self.doc
        img = block.find_all('img', src=CONTAINS_COVER)
        if img:
            cover_uri = img[0].get('src', None)
            if cover_uri:
                self.set('covers', [{'image_url': urljoin(self.base, cover_uri)}])

    def get_downloads(self):
        '''Set 'download_url_<type>' for each epub/mobi/pdf URL meta found.'''
        for dl_type in ['epub', 'mobi', 'pdf']:
            dl_meta = 'citation_{}_url'.format(dl_type)
            value = self.check_metas([dl_meta])
            if value:
                self.set('download_url_{}'.format(dl_type), value)

    def get_license(self):
        '''only looks for cc licenses

        Sets 'rights_url' from elements whose ``href`` matches CONTAINS_CC;
        when several match, the last one in document order wins.
        '''
        links = self.doc.find_all(href=CONTAINS_CC)
        for link in links:
            self.set('rights_url', link['href'])