regluit/core/loaders/ubiquity.py

42 lines
1.7 KiB
Python
Raw Normal View History

import re
2020-02-12 22:57:35 +00:00
from urllib.parse import urlparse, urljoin
2021-05-07 21:56:59 +00:00
from regluit.core.parameters import DOWNLOADABLE
2019-03-28 01:21:25 +00:00
from regluit.utils.lang import lang_to_language_code
from . import BaseScraper
# Matches the "(ed.)" / "(eds.)" marker used to detect editor credits.
HAS_EDS = re.compile(r'\(eds?\.\)')

# Hostnames handled by UbiquityScraper (assigned to its `can_scrape_hosts`).
UBIQUITY_HOSTS = [
    "ubiquitypress.com",
    "kriterium.se",
    "oa.finlit.fi",
    "humanities-map.net",
    "oa.psupress.org",
    "larcommons.net",
    "uwestminsterpress.co.uk",
    "stockholmuniversitypress.se",
    "luminosoa.org",
    "iitikship.iiti.ac.in",
    "aperio.press",
    "press.lse.ac.uk",
    "press.sjms.nu",
    "publishing.vt.edu",
    "universitypress.whiterose.ac.uk",
    "www.winchesteruniversitypress.org",
    "utsepress.lib.uts.edu.au",
    "www.mwv-open.de",
    "hup.fi",
]
class UbiquityScraper(BaseScraper):
    """Scraper for book pages served from the hosts in ``UBIQUITY_HOSTS``.

    Overrides role, language and download-link extraction; role and language
    fall back to the ``BaseScraper`` implementation when the page does not
    carry the markup looked for here.
    """

    can_scrape_hosts = UBIQUITY_HOSTS

    def get_role(self):
        """Return ``'editor'`` when a book description carries an "(ed.)" marker.

        Every ``section.book-description`` element of ``self.doc`` is searched
        for text matching ``HAS_EDS``. If none matches, the result of the base
        class's ``get_role()`` is returned instead.
        """
        for description in self.doc.select('section.book-description'):
            if description.find(string=HAS_EDS):
                return 'editor'
        return super().get_role()

    def get_language(self):
        """Set the ``language`` field from the page's "Language" label.

        The value is read from the element that follows the grandparent of
        the text node "Language" and converted with ``lang_to_language_code``.
        When the label, its surrounding elements, the value element, or a
        usable language code is missing, the base class's ``get_language()``
        is called instead.
        """
        label = self.doc.find(string='Language')
        # Walk up one step at a time: a label without a parent or grandparent
        # must lead to the fallback below, not to an AttributeError.
        parent = label.parent if label else None
        wrapper = parent.parent if parent is not None else None
        value_node = wrapper.find_next_sibling() if wrapper is not None else None
        text = value_node.get_text() if value_node is not None else ''
        code = lang_to_language_code(text) if text else ''
        if code:
            self.set('language', code)
        else:
            super().get_language()

    def get_downloads(self):
        """Record a ``download_url_<type>`` field for each download link found.

        For every type in ``DOWNLOADABLE`` the first ``<a>`` element whose
        ``data-category`` attribute equals ``"<type> download"`` is looked up;
        if it has an ``href``, that value is stripped of surrounding
        whitespace, resolved against ``self.base`` and stored.
        """
        for dl_type in DOWNLOADABLE:
            category = '{} download'.format(dl_type)
            anchor = self.doc.find('a', attrs={'data-category': category})
            if not anchor or 'href' not in anchor.attrs:
                continue
            url = urljoin(self.base, anchor['href'].strip())
            self.set('download_url_{}'.format(dl_type), url)