regluit/core/loaders/__init__.py


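"""
Scrapers and loaders for adding books from external web pages and sitemaps.
"""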
import logging

import requests
from bs4 import BeautifulSoup
from django.conf import settings
from gitenberg.metadata.pandata import Pandata

from regluit.core.bookloader import add_from_bookdatas, BasePandataLoader

from .scrape import BaseScraper
from .hathitrust import HathitrustScraper
from .pressbooks import PressbooksScraper
from .springer import SpringerScraper
from .ubiquity import UbiquityScraper
from .smashwords import SmashwordsScraper

logger = logging.getLogger(__name__)

def get_scraper(url):
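    """
    Return an instance of the first scraper class whose can_scrape()
    accepts url. Scrapers are tried most-specific first; BaseScraper,
    listed last, is the generic fallback.
    """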
    scrapers = [
        PressbooksScraper,
        SpringerScraper,
        UbiquityScraper,
        SmashwordsScraper,
        HathitrustScraper,
        BaseScraper,
    ]
    for scraper in scrapers:
        if scraper.can_scrape(url):
            return scraper(url)

def scrape_sitemap(url, maxnum=None):
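    """
    Fetch the sitemap at url and yield a scraper for each <loc> entry
    (up to maxnum) whose scraped metadata identifies it as a book.
    Request failures are logged and end the generator.
    """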
    try:
        response = requests.get(url, headers={"User-Agent": settings.USER_AGENT})
        doc = BeautifulSoup(response.content, 'lxml')
        for page in doc.find_all('loc')[0:maxnum]:
            scraper = get_scraper(page.text)
            if scraper.metadata.get('genre', None) == 'book':
                yield scraper
    except requests.exceptions.RequestException as e:
        logger.error(e)

def add_by_webpage(url, work=None, user=None):
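    """
    Scrape the page at url and load the editions and ebooks it describes
    via BasePandataLoader. Return the last edition loaded, or None if the
    page yields no editions.
    """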
    edition = None
    scraper = get_scraper(url)
    loader = BasePandataLoader(url)
    pandata = Pandata()
    pandata.metadata = scraper.metadata
    for metadata in pandata.get_edition_list():
        edition = loader.load_from_pandata(metadata, work)
        work = edition.work
    loader.load_ebooks(pandata, edition, user=user)
    return edition

def add_by_sitemap(url, maxnum=None):
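    """
    Scrape the book pages listed in the sitemap at url (at most maxnum)
    and add them via add_from_bookdatas.
    """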
    return add_from_bookdatas(scrape_sitemap(url, maxnum=maxnum))

def scrape_language(url):
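    """Return the language recorded in the scraped metadata for url."""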
    scraper = get_scraper(url)
    return scraper.metadata.get('language')
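
# A minimal usage sketch, assuming Django settings are configured;
# the URLs below are hypothetical examples, not supported sources:
#
#     from regluit.core.loaders import add_by_webpage, add_by_sitemap
#
#     edition = add_by_webpage('https://press.example.edu/books/some-title/')
#     add_by_sitemap('https://press.example.edu/sitemap.xml', maxnum=10)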